Utf8.php 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498
  1. <?php // vi: set fenc=utf-8 ts=4 sw=4 et:
  2. /*
  3. * Copyright (C) 2013 Nicolas Grekas - p@tchwork.com
  4. *
  5. * This library is free software; you can redistribute it and/or modify it
  6. * under the terms of the (at your option):
  7. * Apache License v2.0 (http://apache.org/licenses/LICENSE-2.0.txt), or
  8. * GNU General Public License v2.0 (http://gnu.org/licenses/gpl-2.0.txt).
  9. */
  10. namespace Patchwork;
  11. use Normalizer as n;
  /**
   * UTF-8 grapheme cluster aware string manipulations implementing the
   * quasi-complete set of native PHP string functions that need UTF-8
   * awareness, and more.
   *
   * Missing are the printf-family functions.
   *
   * Most methods rely on the intl (grapheme_*, Normalizer), mbstring and iconv
   * functions; lengths and offsets are expressed in grapheme clusters unless
   * stated otherwise.
   *
   * NOTE(review): the leading "/**" + "/" (i.e. empty comment) line markers found
   * in a few methods are kept from the original source; presumably they are
   * consumed by a build-time preprocessor - confirm before removing them.
   */
  class Utf8
  {
      /**
       * Character pairs replaced by strtocasefold() before the final lowercase
       * step: [0] lists the source characters, [1] their replacements.
       */
      protected static $commonCaseFold = array(
          array('µ','ſ',"\xCD\x85",'ς',"\xCF\x90","\xCF\x91","\xCF\x95","\xCF\x96","\xCF\xB0","\xCF\xB1","\xCF\xB5","\xE1\xBA\x9B","\xE1\xBE\xBE"),
          array('μ','s','ι', 'σ','β', 'θ', 'φ', 'π', 'κ', 'ρ', 'ε', "\xE1\xB9\xA1",'ι' )
      );

      /**
       * UTF-8 encoded C1 code points (U+0080..U+009F, minus the slots unused by
       * Windows-1252) as produced by the native utf8_encode() for bytes
       * 0x80..0x9F. Written as byte escapes so that the table cannot be confused
       * with, or transcoded into, the glyphs of self::$utf8. Entries are
       * index-aligned with self::$utf8.
       */
      protected static $cp1252 = array(
          "\xC2\x80", "\xC2\x82", "\xC2\x83", "\xC2\x84", "\xC2\x85", "\xC2\x86", "\xC2\x87", "\xC2\x88", "\xC2\x89",
          "\xC2\x8A", "\xC2\x8B", "\xC2\x8C", "\xC2\x8E", "\xC2\x91", "\xC2\x92", "\xC2\x93", "\xC2\x94", "\xC2\x95",
          "\xC2\x96", "\xC2\x97", "\xC2\x98", "\xC2\x99", "\xC2\x9A", "\xC2\x9B", "\xC2\x9C", "\xC2\x9E", "\xC2\x9F"
      );

      /**
       * Printable characters that Windows-1252 assigns to bytes 0x80..0x9F,
       * index-aligned with self::$cp1252.
       */
      protected static $utf8 = array('€','‚','ƒ','„','…','†','‡','ˆ','‰','Š','‹','Œ','Ž','‘','’','“','”','•','–','—','˜','™','š','›','œ','ž','Ÿ');


      /**
       * Tells whether $s is a well-formed UTF-8 string.
       *
       * @param string $s
       * @return bool
       */
      public static function isUtf8($s)
      {
          // Since PHP 5.2.5, this also excludes invalid five and six bytes sequences
          return (bool) preg_match('//u', $s);
      }

      /**
       * Generic UTF-8 to ASCII transliteration.
       *
       * Pure ASCII input is returned untouched. Otherwise the string is NFKD
       * normalized, stripped of non-spacing marks, passed through the
       * "translit_extra" replacement table when it is available, and finally
       * transliterated by iconv.
       *
       * @param string $s
       * @return string|false whatever iconv() returns for the transliteration
       */
      public static function toAscii($s)
      {
          if (preg_match("/[\x80-\xFF]/", $s))
          {
              static $translitExtra = false;
              $translitExtra or $translitExtra = self::getData('translit_extra');

              $s = n::normalize($s, n::NFKD);
              $s = preg_replace('/\p{Mn}+/u', '', $s);

              // getData() returns false when the data file is missing: skip the table then
              if ($translitExtra) $s = str_replace($translitExtra[0], $translitExtra[1], $s);

              $s = iconv('UTF-8', 'ASCII' . ('glibc' !== ICONV_IMPL ? '//IGNORE' : '') . '//TRANSLIT', $s);
          }

          return $s;
      }

      /**
       * Unicode transformation for caseless matching.
       *
       * @see http://unicode.org/reports/tr21/tr21-5.html
       *
       * @param string $s
       * @param bool   $full    also apply the "caseFolding_full" replacement table
       * @param bool   $turkish map 'I' to 'ı' (and, when $full, 'İ' to 'i')
       * @return string
       */
      public static function strtocasefold($s, $full = true, $turkish = false)
      {
          $s = str_replace(self::$commonCaseFold[0], self::$commonCaseFold[1], $s);

          if ($turkish)
          {
              $s = str_replace('I', 'ı', $s);
              if ($full) $s = str_replace('İ', 'i', $s);
          }

          if ($full)
          {
              static $fullCaseFold = false;
              $fullCaseFold || $fullCaseFold = self::getData('caseFolding_full');

              // getData() returns false when the data file is missing: skip the table then
              if ($fullCaseFold) $s = str_replace($fullCaseFold[0], $fullCaseFold[1], $s);
          }

          return self::strtolower($s);
      }

      /**
       * Generic case sensitive collation support for self::strnatcmp():
       * NFD normalizes $s then removes its non-spacing marks.
       *
       * @param string $s
       * @return string
       */
      public static function strtonatfold($s)
      {
          $s = n::normalize($s, n::NFD);

          return preg_replace('/\p{Mn}+/u', '', $s);
      }


      // PHP string functions that need UTF-8 awareness

      /**
       * Grapheme cluster aware substr().
       *
       * The first call probes grapheme_substr('éà', 1, -2); when it yields 'à'
       * (the behavior tracked as bug 62759 by the original code) the shim
       * workaround is used instead of the native function.
       *
       * @param string $s
       * @param int    $start
       * @param int    $len
       * @return string|false
       */
      public static function substr($s, $start, $len = 2147483647)
      {
          /**/ static $bug62759;
          /**/ isset($bug62759) or $bug62759 = extension_loaded('intl') && 'à' === grapheme_substr('éà', 1, -2);
          /**/ if ($bug62759)
          /**/ {
              return PHP\Shim\Intl::grapheme_substr_workaround62759($s, $start, $len);
          /**/ }
          /**/ else
          /**/ {
              return grapheme_substr($s, $start, $len);
          /**/ }
      }

      /** Number of grapheme clusters in $s, as reported by grapheme_strlen(). */
      public static function strlen($s)
      {
          return grapheme_strlen($s);
      }

      /** Grapheme cluster aware strpos(), delegating to grapheme_strpos(). */
      public static function strpos($s, $needle, $offset = 0)
      {
          return grapheme_strpos($s, $needle, $offset);
      }

      /** Grapheme cluster aware strrpos(), delegating to grapheme_strrpos(). */
      public static function strrpos($s, $needle, $offset = 0)
      {
          return grapheme_strrpos($s, $needle, $offset);
      }

      /**
       * Grapheme cluster aware stripos().
       *
       * @param string $s
       * @param string $needle
       * @param int    $offset
       * @return int|false
       */
      public static function stripos($s, $needle, $offset = 0)
      {
          /**/ if (50418 > PHP_VERSION_ID || 50500 == PHP_VERSION_ID)
          /**/ {
              // Don't use grapheme_stripos because of https://bugs.php.net/61860
              if ($offset < 0) $offset = 0;
              $pos = mb_stripos($s, $needle, $offset, 'UTF-8');

              // 0 and false are both returned as is
              if (!$pos) return $pos;

              // Convert the code point offset to a grapheme cluster offset
              return grapheme_strlen(iconv_substr($s, 0, $pos, 'UTF-8'));
          /**/ }
          /**/ else
          /**/ {
              return grapheme_stripos($s, $needle, $offset);
          /**/ }
      }

      /**
       * Grapheme cluster aware strripos().
       *
       * @param string $s
       * @param string $needle
       * @param int    $offset
       * @return int|false
       */
      public static function strripos($s, $needle, $offset = 0)
      {
          /**/ if (50418 > PHP_VERSION_ID || 50500 == PHP_VERSION_ID)
          /**/ {
              // Don't use grapheme_strripos because of https://bugs.php.net/61860
              if ($offset < 0) $offset = 0;
              $pos = mb_strripos($s, $needle, $offset, 'UTF-8');

              // 0 and false are both returned as is
              if (!$pos) return $pos;

              // Convert the code point offset to a grapheme cluster offset
              return grapheme_strlen(iconv_substr($s, 0, $pos, 'UTF-8'));
          /**/ }
          /**/ else
          /**/ {
              return grapheme_strripos($s, $needle, $offset);
          /**/ }
      }

      /** UTF-8 aware stristr(); an empty needle yields false. */
      public static function stristr($s, $needle, $before_needle = false)
      {
          if ('' === (string) $needle) return false;

          return mb_stristr($s, $needle, $before_needle, 'UTF-8');
      }

      /** Grapheme cluster aware strstr(), delegating to grapheme_strstr(). */
      public static function strstr($s, $needle, $before_needle = false)
      {
          return grapheme_strstr($s, $needle, $before_needle);
      }

      /** UTF-8 aware strrchr(), delegating to mb_strrchr(). */
      public static function strrchr($s, $needle, $before_needle = false)
      {
          return mb_strrchr($s, $needle, $before_needle, 'UTF-8');
      }

      /** Case insensitive variant of strrchr(), delegating to mb_strrichr(). */
      public static function strrichr($s, $needle, $before_needle = false)
      {
          return mb_strrichr($s, $needle, $before_needle, 'UTF-8');
      }

      /**
       * Lowercases $s and returns it in the requested normalization form.
       *
       * @param string $s
       * @param int    $form one of the Normalizer form constants
       * @return string
       */
      public static function strtolower($s, $form = n::NFC)
      {
          $s = mb_strtolower($s, 'UTF-8');

          return n::isNormalized($s, $form) ? $s : n::normalize($s, $form);
      }

      /**
       * Uppercases $s and returns it in the requested normalization form.
       *
       * @param string $s
       * @param int    $form one of the Normalizer form constants
       * @return string
       */
      public static function strtoupper($s, $form = n::NFC)
      {
          $s = mb_strtoupper($s, 'UTF-8');

          return n::isNormalized($s, $form) ? $s : n::normalize($s, $form);
      }

      /**
       * Grapheme cluster aware wordwrap().
       *
       * Every grapheme cluster is mapped to a one byte placeholder, the native
       * wordwrap() is run on that mask, then the break positions are replayed on
       * the real clusters. Widths are therefore counted in grapheme clusters
       * while all the wrapping rules stay those of the native function.
       *
       * This implementation could be extended to handle unicode word boundaries
       * (see http://www.unicode.org/reports/tr29/).
       *
       * @param string $s
       * @param int    $width
       * @param string $break
       * @param bool   $cut
       * @return string|false false when the native function rejects the arguments
       */
      public static function wordwrap($s, $width = 75, $break = "\n", $cut = false)
      {
          $width = (int) $width;

          // Let the native function validate $width / $break / $cut
          if (false === wordwrap('-', $width, $break, $cut)) return false;

          $break = (string) $break;
          $lines = explode($break, $s);

          if (1 === count($lines) && '' === $lines[0]) return '';

          // $mask holds one byte per entry of $clusters:
          // '#' for an existing break, ' ' for a space, '?' for anything else
          $clusters = array();
          $mask = '';

          foreach ($lines as $i => $line)
          {
              if ($i)
              {
                  $clusters[] = $break;
                  $mask .= '#';
              }

              foreach (self::str_split($line) as $c)
              {
                  $clusters[] = $c;
                  $mask .= ' ' === $c ? ' ' : '?';
              }
          }

          $mask = wordwrap($mask, $width, '#', $cut);

          $result = '';
          $j = 0;        // index in $clusters
          $b = $i = -1;  // $b: current '#' position in $mask, $i: last consumed mask index

          while (false !== $b = strpos($mask, '#', $b + 1))
          {
              for (++$i; $i < $b; ++$i) $result .= $clusters[$j++];

              // A '#' either replaces a space / an existing break (skip it),
              // or was inserted by $cut in the middle of a word (nothing to skip)
              if (isset($clusters[$j]) && ($break === $clusters[$j] || ' ' === $clusters[$j])) ++$j;

              $result .= $break;
          }

          return $result . implode('', array_slice($clusters, $j));
      }

      /**
       * Returns the UTF-8 encoding of code point $c (taken modulo 0x200000).
       *
       * @param int $c
       * @return string
       */
      public static function chr($c)
      {
          if (0x80 > $c %= 0x200000) return chr($c);
          if (0x800 > $c) return chr(0xC0 | $c>>6) . chr(0x80 | $c & 0x3F);
          if (0x10000 > $c) return chr(0xE0 | $c>>12) . chr(0x80 | $c>>6 & 0x3F) . chr(0x80 | $c & 0x3F);

          return chr(0xF0 | $c>>18) . chr(0x80 | $c>>12 & 0x3F) . chr(0x80 | $c>>6 & 0x3F) . chr(0x80 | $c & 0x3F);
      }

      /**
       * Counts the occurrences of each grapheme cluster of $s.
       *
       * Only $mode 1 is supported: any other value triggers an E_USER_WARNING,
       * the mode 1 result being returned anyway.
       *
       * @param string $s
       * @param int    $mode
       * @return array cluster => number of occurrences
       */
      public static function count_chars($s, $mode = 0)
      {
          if (1 != $mode) user_error(__METHOD__ . '(): the only allowed $mode is 1', E_USER_WARNING);

          return array_count_values(self::str_split($s));
      }

      /**
       * Strips leading whitespace (regex \s), or the given characters, from $s.
       *
       * @param string $s
       * @param string $charlist characters to strip; INF (the default) means whitespace
       * @return string
       */
      public static function ltrim($s, $charlist = INF)
      {
          $charlist = INF === $charlist ? '\s' : self::rxClass($charlist);

          return preg_replace("/^{$charlist}+/u", '', $s);
      }

      /**
       * Returns the code point of the first UTF-8 character of $s.
       *
       * When the sequence announced by the lead byte is truncated, the lead
       * byte value itself is returned.
       *
       * @param string $s
       * @return int 0 for an empty input
       */
      public static function ord($s)
      {
          $bytes = unpack('C*', substr($s, 0, 4));
          $a = $bytes ? $bytes[1] : 0;

          if (0xF0 <= $a && isset($bytes[4])) return (($a - 0xF0)<<18) + (($bytes[2] - 0x80)<<12) + (($bytes[3] - 0x80)<<6) + $bytes[4] - 0x80;
          if (0xE0 <= $a && isset($bytes[3])) return (($a - 0xE0)<<12) + (($bytes[2] - 0x80)<<6) + $bytes[3] - 0x80;
          if (0xC0 <= $a && isset($bytes[2])) return (($a - 0xC0)<<6) + $bytes[2] - 0x80;

          return $a;
      }

      /**
       * Strips trailing whitespace (regex \s), or the given characters, from $s.
       *
       * @param string $s
       * @param string $charlist characters to strip; INF (the default) means whitespace
       * @return string
       */
      public static function rtrim($s, $charlist = INF)
      {
          $charlist = INF === $charlist ? '\s' : self::rxClass($charlist);

          // "D" anchors "$" at the very end of the subject: without it, "$" also
          // matches before a final newline and characters preceding it get stripped
          return preg_replace("/{$charlist}+$/uD", '', $s);
      }

      /** Applies self::ltrim() then self::rtrim() with the same $charlist. */
      public static function trim($s, $charlist = INF)
      {
          return self::rtrim(self::ltrim($s, $charlist), $charlist);
      }

      /**
       * UTF-8 aware, case insensitive str_replace().
       *
       * @param string|array $search  empty search strings never match
       * @param string|array $replace inserted literally
       * @param string|array $subject
       * @param int          $count   receives the number of replacements done
       * @return string|array
       */
      public static function str_ireplace($search, $replace, $subject, &$count = null)
      {
          $patterns = array();

          foreach ((array) $search as $k => $needle)
          {
              $needle = (string) $needle;
              $patterns[$k] = '' !== $needle ? '/' . preg_quote($needle, '/') . '/ui' : '/^(?<=.)$/';
          }

          // Neutralize "\" and "$" so that preg_replace() does not interpret
          // the replacement as containing back-references
          if (is_array($replace))
          {
              foreach ($replace as $k => $r) $replace[$k] = addcslashes($r, '\\$');
          }
          else $replace = addcslashes($replace, '\\$');

          $subject = preg_replace($patterns, $replace, $subject, -1, $done);
          $count = $done;

          return $subject;
      }

      /**
       * Grapheme cluster aware str_pad().
       *
       * @param string $s
       * @param int    $len  target length, in grapheme clusters
       * @param string $pad
       * @param int    $type STR_PAD_RIGHT, STR_PAD_LEFT or STR_PAD_BOTH
       * @return string|null null, after an E_USER_WARNING, for an empty $pad or an unknown $type
       */
      public static function str_pad($s, $len, $pad = ' ', $type = STR_PAD_RIGHT)
      {
          $slen = grapheme_strlen($s);
          if ($len <= $slen) return $s;

          $padlen = grapheme_strlen($pad);
          if (!$padlen)
          {
              // Would otherwise end in a modulo by zero
              user_error(__METHOD__ . '(): Padding string cannot be empty', E_USER_WARNING);
              return null;
          }

          $freelen = $len - $slen;

          if (STR_PAD_RIGHT == $type) return $s . self::padFill($pad, $padlen, $freelen);
          if (STR_PAD_LEFT == $type) return self::padFill($pad, $padlen, $freelen) . $s;
          if (STR_PAD_BOTH == $type)
          {
              // When $freelen is odd, the extra cluster goes to the right side
              $left = (int) floor($freelen / 2);
              $right = (int) ceil($freelen / 2);

              return self::padFill($pad, $padlen, $left) . $s . self::padFill($pad, $padlen, $right);
          }

          user_error(__METHOD__ . '(): Padding type has to be STR_PAD_LEFT, STR_PAD_RIGHT, or STR_PAD_BOTH', E_USER_WARNING);
      }

      /**
       * Builds $count grapheme clusters of padding by repeating $pad, the last
       * repetition being truncated as needed. Helper for self::str_pad().
       *
       * @param string $pad
       * @param int    $padlen number of grapheme clusters in $pad, greater than 0
       * @param int    $count
       * @return string
       */
      protected static function padFill($pad, $padlen, $count)
      {
          $count = (int) $count;
          $rest = $count % $padlen;

          // ($count - $rest) is a multiple of $padlen: str_repeat() gets an exact integer
          return str_repeat($pad, ($count - $rest) / $padlen) . ($rest ? grapheme_substr($pad, 0, $rest) : '');
      }

      /** Randomly shuffles the grapheme clusters of $s. */
      public static function str_shuffle($s)
      {
          $s = self::str_split($s);
          shuffle($s);

          return implode('', $s);
      }

      /**
       * Splits $s in chunks of $len grapheme clusters.
       *
       * @param string $s
       * @param int    $len when lower than 1, the call is forwarded as is to the native str_split()
       * @return array
       */
      public static function str_split($s, $len = 1)
      {
          if (1 > $len = (int) $len)
          {
              // Forward the raw argument so that the native function reports the error
              $len = func_get_arg(1);

              return str_split($s, $len);
          }

          /**/ if (extension_loaded('intl'))
          /**/ {
              $a = array();
              $p = 0;
              $l = strlen($s);

              // $p is a byte offset, moved forward by grapheme_extract() itself
              while ($p < $l) $a[] = grapheme_extract($s, 1, GRAPHEME_EXTR_COUNT, $p, $p);
          /**/ }
          /**/ else
          /**/ {
              // Falls back to PCRE's own extended grapheme cluster escape when
              // the GRAPHEME_CLUSTER_RX constant is not defined
              $rx = defined('GRAPHEME_CLUSTER_RX') ? GRAPHEME_CLUSTER_RX : '\X';
              preg_match_all('/' . $rx . '/u', $s, $a);
              $a = $a[0];
          /**/ }

          if (1 == $len) return $a;

          $chunks = array();
          foreach (array_chunk($a, $len) as $chunk) $chunks[] = implode('', $chunk);

          return $chunks;
      }

      /**
       * UTF-8 aware str_word_count(): words are runs of letters (\pL) and of
       * $charlist characters, optionally joined by a dash punctuation or an
       * apostrophe.
       *
       * @param string $s
       * @param int    $format   0: number of words, 1: list of words,
       *                         2: words keyed by their grapheme cluster offset
       * @param string $charlist additional characters considered as part of a word
       * @return int|array
       */
      public static function str_word_count($s, $format = 0, $charlist = '')
      {
          $charlist = self::rxClass($charlist, '\pL');

          // Odd indexes hold the words, even indexes hold the text between them
          $parts = preg_split("/({$charlist}+(?:[\p{Pd}’']{$charlist}+)*)/u", $s, -1, PREG_SPLIT_DELIM_CAPTURE);
          $len = count($parts);

          if (1 == $format)
          {
              $words = array();
              for ($i = 1; $i < $len; $i += 2) $words[] = $parts[$i];

              return $words;
          }

          if (2 == $format)
          {
              $words = array();
              $offset = grapheme_strlen($parts[0]);

              for ($i = 1; $i < $len; $i += 2)
              {
                  $words[$offset] = $parts[$i];
                  $offset += grapheme_strlen($parts[$i]) + grapheme_strlen($parts[$i+1]);
              }

              return $words;
          }

          return ($len - 1) / 2;
      }

      /** Binary comparison of the NFD forms of $a and $b; identical strings short-circuit to 0. */
      public static function strcmp($a, $b)
      {
          if ((string) $a === (string) $b) return 0;

          return strcmp(n::normalize($a, n::NFD), n::normalize($b, n::NFD));
      }

      /** Natural order comparison of $a and $b after self::strtonatfold(). */
      public static function strnatcmp($a, $b)
      {
          if ((string) $a === (string) $b) return 0;

          return strnatcmp(self::strtonatfold($a), self::strtonatfold($b));
      }

      /** self::strcmp() applied to the case folded strings. */
      public static function strcasecmp($a, $b)
      {
          return self::strcmp(self::strtocasefold($a), self::strtocasefold($b));
      }

      /** self::strnatcmp() applied to the case folded strings. */
      public static function strnatcasecmp($a, $b)
      {
          return self::strnatcmp(self::strtocasefold($a), self::strtocasefold($b));
      }

      /** self::strncmp() applied to the case folded strings. */
      public static function strncasecmp($a, $b, $len)
      {
          return self::strncmp(self::strtocasefold($a), self::strtocasefold($b), $len);
      }

      /** self::strcmp() restricted to the first $len grapheme clusters of each string. */
      public static function strncmp($a, $b, $len)
      {
          return self::strcmp(self::substr($a, 0, $len), self::substr($b, 0, $len));
      }

      /**
       * Length, in grapheme clusters, of the initial segment of $s that
       * contains none of the $charlist characters.
       *
       * @param string $s
       * @param string $charlist
       * @param int    $start
       * @param int    $len
       * @return int|null null when $charlist is empty
       */
      public static function strcspn($s, $charlist, $start = 0, $len = 2147483647)
      {
          if ('' === (string) $charlist) return null;
          if ($start || 2147483647 != $len) $s = self::substr($s, $start, $len);

          if (preg_match('/^(.*?)' . self::rxClass($charlist) . '/us', $s, $m)) return grapheme_strlen($m[1]);

          return grapheme_strlen($s);
      }

      /**
       * Returns the part of $s starting at the first occurrence of any of the
       * $charlist characters, or false when there is none.
       */
      public static function strpbrk($s, $charlist)
      {
          if (!preg_match('/' . self::rxClass($charlist) . '/us', $s, $m)) return false;

          // Byte-wise native functions are fine here: only the tail is cut out
          return substr($s, strpos($s, $m[0]));
      }

      /** Reverses the order of the grapheme clusters of $s. */
      public static function strrev($s)
      {
          return implode('', array_reverse(self::str_split($s)));
      }

      /**
       * Length, in grapheme clusters, of the initial segment of $s that is
       * made only of $mask characters.
       *
       * @param string $s
       * @param string $mask
       * @param int    $start
       * @param int    $len
       * @return int
       */
      public static function strspn($s, $mask, $start = 0, $len = 2147483647)
      {
          if ($start || 2147483647 != $len) $s = self::substr($s, $start, $len);

          return preg_match('/^' . self::rxClass($mask) . '+/u', $s, $m) ? grapheme_strlen($m[0]) : 0;
      }

      /**
       * Grapheme cluster aware strtr().
       *
       * With three arguments, the n-th cluster of $from is replaced by the n-th
       * cluster of $to, the longest list being truncated to the size of the
       * shortest. With two arguments, $from is passed as is to the native strtr().
       *
       * @param string       $s
       * @param string|array $from
       * @param string       $to
       * @return string
       */
      public static function strtr($s, $from, $to = INF)
      {
          if (INF !== $to)
          {
              $from = self::str_split($from);
              $to = self::str_split($to);

              $size = min(count($from), count($to));
              $from = array_combine(array_slice($from, 0, $size), array_slice($to, 0, $size));
          }

          return strtr($s, $from);
      }

      /**
       * Compares $b with the part of $a selected by $offset / $len.
       *
       * @param string $a
       * @param string $b
       * @param int    $offset
       * @param int    $len
       * @param mixed  $i truthy for a case insensitive comparison
       * @return int
       */
      public static function substr_compare($a, $b, $offset, $len = 2147483647, $i = 0)
      {
          $a = self::substr($a, $offset, $len);

          return $i ? self::strcasecmp($a, $b) : self::strcmp($a, $b);
      }

      /** Counts the occurrences of $needle in the part of $s selected by $offset / $len. */
      public static function substr_count($s, $needle, $offset = 0, $len = 2147483647)
      {
          return substr_count(self::substr($s, $offset, $len), $needle);
      }

      /** Grapheme cluster aware substr_replace(), for string arguments. */
      public static function substr_replace($s, $replace, $start, $len = 2147483647)
      {
          $s = self::str_split($s);
          $replace = self::str_split($replace);
          array_splice($s, $start, $len, $replace);

          return implode('', $s);
      }

      /** Title-cases the first UTF-8 character of $s, leaving the rest untouched. */
      public static function ucfirst($s)
      {
          $c = iconv_substr($s, 0, 1, 'UTF-8');

          // strlen() is byte-wise here on purpose: it locates the end of $c in $s
          return self::ucwords($c) . substr($s, strlen($c));
      }

      /** Lowercases the first UTF-8 character of $s, leaving the rest untouched. */
      public static function lcfirst($s)
      {
          $c = iconv_substr($s, 0, 1, 'UTF-8');

          // strlen() is byte-wise here on purpose: it locates the end of $c in $s
          return mb_strtolower($c, 'UTF-8') . substr($s, strlen($c));
      }

      /** Title-cases $s using mb_convert_case(). */
      public static function ucwords($s)
      {
          return mb_convert_case($s, MB_CASE_TITLE, 'UTF-8');
      }

      /**
       * number_format() accepting multi-byte separators.
       *
       * @param float  $number
       * @param int    $decimals
       * @param string $dec_point
       * @param string $thousands_sep
       * @return string
       */
      public static function number_format($number, $decimals = 0, $dec_point = '.', $thousands_sep = ',')
      {
          /**/ if (PHP_VERSION_ID < 50400)
          /**/ {
              // Before PHP 5.4, separators longer than one byte are substituted after formatting
              if (isset($thousands_sep[1]) || isset($dec_point[1]))
              {
                  return str_replace(
                      array('.', ','),
                      array($dec_point, $thousands_sep),
                      number_format($number, $decimals, '.', ',')
                  );
              }
          /**/ }

          return number_format($number, $decimals, $dec_point, $thousands_sep);
      }

      /**
       * utf8_encode() that also maps the bytes 0x80..0x9F to the characters
       * listed in self::$utf8, instead of C1 code points.
       *
       * @param string $s
       * @return string
       */
      public static function utf8_encode($s)
      {
          $s = utf8_encode($s);

          // Every self::$cp1252 entry starts with 0xC2: no such byte, nothing to remap
          if (false === strpos($s, "\xC2")) return $s;

          return str_replace(self::$cp1252, self::$utf8, $s);
      }

      /**
       * utf8_decode() that also maps the characters listed in self::$utf8
       * back to the bytes 0x80..0x9F.
       *
       * @param string $s
       * @return string
       */
      public static function utf8_decode($s)
      {
          $s = str_replace(self::$utf8, self::$cp1252, $s);

          return utf8_decode($s);
      }

      /**
       * Builds a regular expression fragment, for "/"-delimited patterns,
       * matching any one of the grapheme clusters of $s.
       *
       * Single characters are gathered in one bracket expression, clusters made
       * of several code points are added as alternatives.
       *
       * @param string $s     the characters to match
       * @param string $class raw content to start the bracket expression with
       * @return string
       */
      protected static function rxClass($s, $class = '')
      {
          $alternatives = array();

          foreach (self::str_split($s) as $cluster)
          {
              // A dash is only safe as first item of a bracket expression
              if ('-' === $cluster) $class = '-' . $class;
              else if (!isset($cluster[2])) $class .= preg_quote($cluster, '/');
              else if (1 === iconv_strlen($cluster, 'UTF-8')) $class .= $cluster;
              else $alternatives[] = preg_quote($cluster, '/');
          }

          // "[]" is not a valid empty bracket expression: only emit it when filled
          if ('' !== $class)
          {
              $class = '[' . $class . ']';
              if (!$alternatives) return $class;
              array_unshift($alternatives, $class);
          }

          // Nothing to match at all: return a group that always fails
          if (!$alternatives) return '(?:(?!))';

          // Always grouped, so that a quantifier appended by the caller applies to whole clusters
          return '(?:' . implode('|', $alternatives) . ')';
      }

      /**
       * Loads the serialized data table stored in Utf8/data/$file.ser, next to this file.
       *
       * @param string $file base name of the data file
       * @return mixed the unserialized content, or false when the file does not exist
       */
      protected static function getData($file)
      {
          $file = __DIR__ . '/Utf8/data/' . $file . '.ser';

          if (file_exists($file)) return unserialize(file_get_contents($file));

          return false;
      }
  }