Nie możesz wybrać więcej niż 25 tematów. Tematy muszą się zaczynać od litery lub cyfry, mogą zawierać myślniki ('-') i mogą mieć do 35 znaków.
 
 
 
 
 

388 wierszy
11 KiB

  1. <?php
  /**
   * IDNA URL encoder
   *
   * Converts internationalized host names to their ASCII-compatible (ACE)
   * form by Punycode-encoding every label that contains non-ASCII characters.
   *
   * Note: Not fully compliant, as nameprep does nothing yet.
   *
   * @package Requests
   * @subpackage Utilities
   * @see https://tools.ietf.org/html/rfc3490 IDNA specification
   * @see https://tools.ietf.org/html/rfc3492 Punycode/Bootstrap specification
   */
  class Requests_IDNAEncoder {
      /**
       * ACE prefix used for IDNA
       *
       * @see https://tools.ietf.org/html/rfc3490#section-5
       * @var string
       */
      const ACE_PREFIX = 'xn--';

      /**
       * Maximum length of a single label, in bytes
       *
       * @see https://tools.ietf.org/html/rfc3490#section-4.1
       * @var int
       */
      const MAX_LABEL_LENGTH = 63;

      /**#@+
       * Bootstrap constant for Punycode
       *
       * @see https://tools.ietf.org/html/rfc3492#section-5
       * @var int
       */
      const BOOTSTRAP_BASE = 36;
      const BOOTSTRAP_TMIN = 1;
      const BOOTSTRAP_TMAX = 26;
      const BOOTSTRAP_SKEW = 38;
      const BOOTSTRAP_DAMP = 700;
      const BOOTSTRAP_INITIAL_BIAS = 72;
      const BOOTSTRAP_INITIAL_N = 128;
      /**#@-*/

      /**
       * Encode a hostname using Punycode
       *
       * The hostname is split on "." and every label is converted on its own
       * with to_ascii(); the converted labels are joined with "." again.
       *
       * @throws Requests_Exception If a label cannot be converted (see to_ascii())
       *
       * @param string $string Hostname
       * @return string Punycode-encoded hostname
       */
      public static function encode($string) {
          $parts = explode('.', $string);

          // Assign by key instead of iterating by reference, so that no
          // reference to the last element is left behind after the loop.
          foreach ($parts as $index => $part) {
              $parts[$index] = self::to_ascii($part);
          }

          return implode('.', $parts);
      }

      /**
       * Convert a UTF-8 string to an ASCII string using Punycode
       *
       * Follows the ToASCII steps of RFC 3490, section 4.1. The length limit is
       * applied to the number of bytes of the ASCII result.
       *
       * @throws Requests_Exception Provided string longer than 63 ASCII characters (`idna.provided_too_long`)
       * @throws Requests_Exception Prepared string longer than 63 ASCII characters (`idna.prepared_too_long`)
       * @throws Requests_Exception Provided string already begins with xn-- (`idna.provided_is_prefixed`)
       * @throws Requests_Exception Encoded string longer than 63 ASCII characters (`idna.encoded_too_long`)
       *
       * @param string $string ASCII or UTF-8 string (result may be at most 63 characters)
       * @return string ASCII string
       */
      public static function to_ascii($string) {
          // Step 1: Check if the string is already ASCII
          if (self::is_ascii($string)) {
              // Skip to step 8: only the size check applies to ASCII input
              if (strlen($string) <= self::MAX_LABEL_LENGTH) {
                  return $string;
              }

              throw new Requests_Exception('Provided string is too long', 'idna.provided_too_long', $string);
          }

          // Step 2: nameprep
          $string = self::nameprep($string);

          // Step 3: UseSTD3ASCIIRules is false, continue

          // Step 4: Check if it's ASCII now
          if (self::is_ascii($string)) {
              // Skip to step 8: only the size check applies to ASCII input
              if (strlen($string) <= self::MAX_LABEL_LENGTH) {
                  return $string;
              }

              throw new Requests_Exception('Prepared string is too long', 'idna.prepared_too_long', $string);
          }

          // Step 5: Check ACE prefix
          if (strpos($string, self::ACE_PREFIX) === 0) {
              throw new Requests_Exception('Provided string begins with ACE prefix', 'idna.provided_is_prefixed', $string);
          }

          // Step 6: Encode with Punycode
          $string = self::punycode_encode($string);

          // Step 7: Prepend ACE prefix
          $string = self::ACE_PREFIX . $string;

          // Step 8: Check size
          if (strlen($string) <= self::MAX_LABEL_LENGTH) {
              return $string;
          }

          throw new Requests_Exception('Encoded string is too long', 'idna.encoded_too_long', $string);
      }

      /**
       * Check whether a given string contains only ASCII characters
       *
       * @internal (Testing found regex was the fastest implementation)
       *
       * @param string $string
       * @return bool Is the string ASCII-only?
       */
      protected static function is_ascii($string) {
          // Anything other than exactly one match (including a PCRE failure)
          // is reported as "ASCII-only".
          return (preg_match('/(?:[^\x00-\x7F])/', $string) !== 1);
      }

      /**
       * Prepare a string for use as an IDNA name
       *
       * Currently a placeholder that returns its input unchanged.
       *
       * @todo Implement this based on RFC 3491 and the newer 5891
       * @param string $string
       * @return string Prepared string
       */
      protected static function nameprep($string) {
          return $string;
      }

      /**
       * Convert a UTF-8 string to a UCS-4 codepoint array
       *
       * Based on Requests_IRI::replace_invalid_with_pct_encoding(), but invalid
       * input is rejected with an exception instead of being replaced.
       *
       * @throws Requests_Exception Invalid UTF-8 codepoint (`idna.invalidcodepoint`)
       * @param string $input
       * @return array Unicode code points
       */
      protected static function utf8_to_codepoints($input) {
          $codepoints = array();

          // strlen() counts bytes, which is what the decoder walks over
          $strlen   = strlen($input);
          $position = 0;

          while ($position < $strlen) {
              $value = ord($input[$position]);

              // The lead byte determines the sequence length and contributes
              // the most significant bits of the code point.
              if (($value & 0x80) === 0x00) {
                  // One byte sequence:
                  $character = $value;
                  $length    = 1;
              }
              elseif (($value & 0xE0) === 0xC0) {
                  // Two byte sequence:
                  $character = ($value & 0x1F) << 6;
                  $length    = 2;
              }
              elseif (($value & 0xF0) === 0xE0) {
                  // Three byte sequence:
                  $character = ($value & 0x0F) << 12;
                  $length    = 3;
              }
              elseif (($value & 0xF8) === 0xF0) {
                  // Four byte sequence:
                  $character = ($value & 0x07) << 18;
                  $length    = 4;
              }
              else {
                  // Invalid byte:
                  throw new Requests_Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $value);
              }

              // The input must not end in the middle of a sequence
              if ($position + $length > $strlen) {
                  throw new Requests_Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $character);
              }

              // Fold in the continuation bytes, six bits each, most significant first
              for ($offset = 1; $offset < $length; $offset++) {
                  $value = ord($input[$position + $offset]);

                  // Every continuation byte must look like 10xxxxxx
                  if (($value & 0xC0) !== 0x80) {
                      throw new Requests_Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $character);
                  }

                  $character |= ($value & 0x3F) << (($length - 1 - $offset) * 6);
              }

              // Non-shortest form sequences are invalid
              $is_overlong = ($length > 1 && $character <= 0x7F)
                  || ($length > 2 && $character <= 0x7FF)
                  || ($length > 3 && $character <= 0xFFFF);

              // Noncharacters
              $is_noncharacter = ($character & 0xFFFE) === 0xFFFE
                  || ($character >= 0xFDD0 && $character <= 0xFDEF);

              // Everything else not in ucschar
              $is_outside_ucschar = ($character > 0xD7FF && $character < 0xF900)
                  || $character < 0x20
                  || ($character > 0x7E && $character < 0xA0)
                  || $character > 0xEFFFD;

              if ($is_overlong || $is_noncharacter || $is_outside_ucschar) {
                  throw new Requests_Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $character);
              }

              $codepoints[] = $character;
              $position    += $length;
          }

          return $codepoints;
      }

      /**
       * RFC3492-compliant encoder
       *
       * All arithmetic is kept in integers. Integer overflow of delta is not
       * checked explicitly.
       *
       * @internal Pseudo-code from Section 6.3 is quoted in the comments next to relevant code
       * @throws Requests_Exception On character outside of the domain (never happens with Punycode) (`idna.character_outside_domain`)
       * @throws Requests_Exception On invalid UTF-8 input (`idna.invalidcodepoint`)
       *
       * @param string $input UTF-8 encoded string to encode
       * @return string Punycode-encoded string
       */
      public static function punycode_encode($input) {
          $output = '';
          // let n = initial_n
          $n = self::BOOTSTRAP_INITIAL_N;
          // let delta = 0
          $delta = 0;
          // let bias = initial_bias
          $bias = self::BOOTSTRAP_INITIAL_BIAS;
          // let h = b = the number of basic code points in the input
          $h = 0; // counted in the loop below
          // copy them to the output in order
          $codepoints = self::utf8_to_codepoints($input);
          $extended   = array();

          foreach ($codepoints as $char) {
              if ($char < 128) {
                  // Character is valid ASCII
                  // TODO: this should also check if it's valid for a URL
                  $output .= chr($char);
                  $h++;
              }
              // Check if the character is non-ASCII, but below initial n
              // This never occurs for Punycode, so ignore in coverage
              // @codeCoverageIgnoreStart
              elseif ($char < $n) {
                  throw new Requests_Exception('Invalid character', 'idna.character_outside_domain', $char);
              }
              // @codeCoverageIgnoreEnd
              else {
                  // Used as a set: each distinct non-basic code point once
                  $extended[$char] = true;
              }
          }

          // Distinct non-basic code points in ascending order; each loop pass
          // below handles "the minimum code point >= n in the input".
          $extended = array_keys($extended);
          sort($extended);

          $b = $h;
          // [copy them] followed by a delimiter if b > 0
          if ($b > 0) {
              $output .= '-';
          }

          // {if the input contains a non-basic code point < n then fail}
          // while h < length(input) do begin
          //   let m = the minimum code point >= n in the input
          foreach ($extended as $m) {
              // let delta = delta + (m - n) * (h + 1), fail on overflow
              $delta += ($m - $n) * ($h + 1);
              // let n = m
              $n = $m;

              // for each code point c in the input (in order) do begin
              foreach ($codepoints as $c) {
                  // if c < n then increment delta, fail on overflow
                  if ($c < $n) {
                      $delta++;
                  }
                  // if c == n then begin
                  elseif ($c === $n) {
                      // let q = delta
                      $q = $delta;

                      // for k = base to infinity in steps of base do begin
                      for ($k = self::BOOTSTRAP_BASE; ; $k += self::BOOTSTRAP_BASE) {
                          // let t = tmin if k <= bias {+ tmin}, or
                          //     tmax if k >= bias + tmax, or k - bias otherwise
                          if ($k <= ($bias + self::BOOTSTRAP_TMIN)) {
                              $t = self::BOOTSTRAP_TMIN;
                          }
                          elseif ($k >= ($bias + self::BOOTSTRAP_TMAX)) {
                              $t = self::BOOTSTRAP_TMAX;
                          }
                          else {
                              $t = $k - $bias;
                          }

                          // if q < t then break
                          if ($q < $t) {
                              break;
                          }

                          // output the code point for digit t + ((q - t) mod (base - t))
                          $digit   = $t + (($q - $t) % (self::BOOTSTRAP_BASE - $t));
                          $output .= self::digit_to_char($digit);

                          // let q = (q - t) div (base - t)
                          // floor() returns a float, so cast back to keep q an integer
                          $q = (int) floor(($q - $t) / (self::BOOTSTRAP_BASE - $t));
                      } // end

                      // output the code point for digit q
                      $output .= self::digit_to_char($q);
                      // let bias = adapt(delta, h + 1, test h equals b?)
                      $bias = self::adapt($delta, $h + 1, $h === $b);
                      // let delta = 0
                      $delta = 0;
                      // increment h
                      $h++;
                  } // end
              } // end

              // increment delta and n
              $delta++;
              $n++;
          } // end

          return $output;
      }

      /**
       * Convert a digit to its respective character
       *
       * Digits 0-25 map to "a"-"z" and digits 26-35 map to "0"-"9".
       *
       * @see https://tools.ietf.org/html/rfc3492#section-5
       * @throws Requests_Exception On invalid digit (`idna.invalid_digit`)
       *
       * @param int $digit Digit in the range 0-35
       * @return string Single character corresponding to digit
       */
      protected static function digit_to_char($digit) {
          // @codeCoverageIgnoreStart
          // As far as I know, this never happens, but still good to be sure.
          if ($digit < 0 || $digit > 35) {
              throw new Requests_Exception(sprintf('Invalid digit %d', $digit), 'idna.invalid_digit', $digit);
          }
          // @codeCoverageIgnoreEnd

          $digits = 'abcdefghijklmnopqrstuvwxyz0123456789';
          return substr($digits, $digit, 1);
      }

      /**
       * Adapt the bias
       *
       * Every division is an integer division ("div" in the RFC pseudo-code);
       * results of floor() are cast back to int so that an integer is returned.
       *
       * @see https://tools.ietf.org/html/rfc3492#section-6.1
       * @param int $delta
       * @param int $numpoints
       * @param bool $firsttime
       * @return int New bias
       */
      protected static function adapt($delta, $numpoints, $firsttime) {
          // function adapt(delta,numpoints,firsttime):
          // if firsttime then let delta = delta div damp
          // else let delta = delta div 2
          $divisor = $firsttime ? self::BOOTSTRAP_DAMP : 2;
          $delta   = (int) floor($delta / $divisor);

          // let delta = delta + (delta div numpoints)
          $delta += (int) floor($delta / $numpoints);

          // let k = 0
          $k = 0;

          // while delta > ((base - tmin) * tmax) div 2 do begin
          $base_minus_tmin = self::BOOTSTRAP_BASE - self::BOOTSTRAP_TMIN;
          $max             = (int) floor(($base_minus_tmin * self::BOOTSTRAP_TMAX) / 2);
          while ($delta > $max) {
              // let delta = delta div (base - tmin)
              $delta = (int) floor($delta / $base_minus_tmin);
              // let k = k + base
              $k += self::BOOTSTRAP_BASE;
          } // end

          // return k + (((base - tmin + 1) * delta) div (delta + skew))
          return $k + (int) floor((($base_minus_tmin + 1) * $delta) / ($delta + self::BOOTSTRAP_SKEW));
      }
  }
  358. }