Crawler.php 21 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777
  1. <?php
  2. /*
  3. * This file is part of the Symfony package.
  4. *
  5. * (c) Fabien Potencier <fabien@symfony.com>
  6. *
  7. * For the full copyright and license information, please view the LICENSE
  8. * file that was distributed with this source code.
  9. */
  10. namespace Symfony\Component\DomCrawler;
  11. use Symfony\Component\CssSelector\CssSelector;
  /**
   * Crawler eases navigation of a list of \DOMNode objects.
   *
   * The crawler is an \SplObjectStorage holding DOM nodes, plus the URI that
   * is handed to the Link and Form objects it creates.
   *
   * @author Fabien Potencier <fabien@symfony.com>
   *
   * @api
   */
  class Crawler extends \SplObjectStorage
  {
      /**
       * @var string The current URI or the base href value
       */
      protected $uri;

      /**
       * Constructor.
       *
       * @param mixed  $node A Node to use as the base for the crawling
       * @param string $uri  The current URI or the base href value
       *
       * @api
       */
      public function __construct($node = null, $uri = null)
      {
          $this->uri = $uri;

          $this->add($node);
      }

      /**
       * Removes all the nodes.
       *
       * @api
       */
      public function clear()
      {
          $this->removeAll($this);
      }

      /**
       * Adds a node to the current list of nodes.
       *
       * This method dispatches to the specialized add*() method matching
       * the type of the argument; null is accepted and adds nothing.
       *
       * @param \DOMNodeList|\DOMNode|array|string|null $node A node
       *
       * @throws \InvalidArgumentException When node is not the expected type.
       *
       * @api
       */
      public function add($node)
      {
          if ($node instanceof \DOMNodeList) {
              $this->addNodeList($node);
          } elseif ($node instanceof \DOMNode) {
              $this->addNode($node);
          } elseif (is_array($node)) {
              $this->addNodes($node);
          } elseif (is_string($node)) {
              $this->addContent($node);
          } elseif (null !== $node) {
              throw new \InvalidArgumentException(sprintf('Expecting a DOMNodeList or DOMNode instance, an array, a string, or null, but got "%s".', is_object($node) ? get_class($node) : gettype($node)));
          }
      }

      /**
       * Adds HTML/XML content.
       *
       * If the charset is not set via the content type, it is assumed
       * to be ISO-8859-1, which is the default charset defined by the
       * HTTP 1.1 specification.
       *
       * Content whose type mentions neither "xml" nor "html" is ignored.
       *
       * @param string      $content A string to parse as HTML/XML
       * @param null|string $type    The content type of the string
       *
       * @return null|void
       */
      public function addContent($content, $type = null)
      {
          if (empty($type)) {
              $type = 'text/html';
          }

          // DOM only for HTML/XML content
          if (!preg_match('/(x|ht)ml/i', $type, $matches)) {
              return null;
          }

          $charset = 'ISO-8859-1';
          // the "charset" parameter name is matched case-insensitively
          if (false !== $pos = stripos($type, 'charset=')) {
              $declared = substr($type, $pos + 8);
              if (false !== $pos = strpos($declared, ';')) {
                  $declared = substr($declared, 0, $pos);
              }

              // the value may be quoted and/or padded: charset="utf-8"
              $declared = trim($declared, " \t\r\n\"'");
              if ('' !== $declared) {
                  $charset = $declared;
              }
          }

          // the pattern above is case-insensitive, so normalize the captured
          // prefix before comparing ("text/XML" must be handled as XML)
          if ('x' === strtolower($matches[1])) {
              $this->addXmlContent($content, $charset);
          } else {
              $this->addHtmlContent($content, $charset);
          }
      }

      /**
       * Adds an HTML content to the list of nodes.
       *
       * The libxml errors are disabled when the content is parsed.
       *
       * If you want to get parsing errors, be sure to enable
       * internal errors via libxml_use_internal_errors(true)
       * and then, get the errors via libxml_get_errors(). Be
       * sure to clear errors with libxml_clear_errors() afterward.
       *
       * When the parsed content declares a <base href>, the crawler URI is
       * updated: resolved against the current URI when there is one, used
       * as is otherwise.
       *
       * @param string $content The HTML content
       * @param string $charset The charset
       *
       * @api
       */
      public function addHtmlContent($content, $charset = 'UTF-8')
      {
          $current = libxml_use_internal_errors(true);
          $disableEntities = libxml_disable_entity_loader(true);

          $dom = new \DOMDocument('1.0', $charset);
          $dom->validateOnParse = true;

          // an empty string cannot be parsed (loadHTML() rejects it), so the
          // document is simply left empty in that case
          if ('' !== (string) $content) {
              if (function_exists('mb_convert_encoding') && in_array(strtolower($charset), array_map('strtolower', mb_list_encodings()))) {
                  $content = mb_convert_encoding($content, 'HTML-ENTITIES', $charset);
              }

              @$dom->loadHTML($content);
          }

          // restore the libxml settings that were active before parsing
          libxml_use_internal_errors($current);
          libxml_disable_entity_loader($disableEntities);

          $this->addDocument($dom);

          $base = $this->filterXPath('descendant-or-self::base')->extract(array('href'));

          $baseHref = current($base);
          if (count($base) && !empty($baseHref)) {
              if ($this->uri) {
                  // resolve the base href against the current URI through a
                  // throw-away <a> element wrapped in a Link
                  $linkNode = $dom->createElement('a');
                  $linkNode->setAttribute('href', $baseHref);
                  $link = new Link($linkNode, $this->uri);
                  $this->uri = $link->getUri();
              } else {
                  $this->uri = $baseHref;
              }
          }
      }

      /**
       * Adds an XML content to the list of nodes.
       *
       * The libxml errors are disabled when the content is parsed.
       *
       * If you want to get parsing errors, be sure to enable
       * internal errors via libxml_use_internal_errors(true)
       * and then, get the errors via libxml_get_errors(). Be
       * sure to clear errors with libxml_clear_errors() afterward.
       *
       * @param string $content The XML content
       * @param string $charset The charset
       *
       * @api
       */
      public function addXmlContent($content, $charset = 'UTF-8')
      {
          $current = libxml_use_internal_errors(true);
          $disableEntities = libxml_disable_entity_loader(true);

          $dom = new \DOMDocument('1.0', $charset);
          $dom->validateOnParse = true;

          // an empty string cannot be parsed (loadXML() rejects it), so the
          // document is simply left empty in that case
          if ('' !== (string) $content) {
              // remove the default namespace to make XPath expressions simpler
              // NOTE(review): this rewrites every "xmlns" occurrence, including
              // prefixed declarations and plain text -- confirm this is intended
              @$dom->loadXML(str_replace('xmlns', 'ns', $content), LIBXML_NONET);
          }

          // restore the libxml settings that were active before parsing
          libxml_use_internal_errors($current);
          libxml_disable_entity_loader($disableEntities);

          $this->addDocument($dom);
      }

      /**
       * Adds a \DOMDocument to the list of nodes.
       *
       * Only the document element is added; a document without a root
       * element adds nothing.
       *
       * @param \DOMDocument $dom A \DOMDocument instance
       *
       * @api
       */
      public function addDocument(\DOMDocument $dom)
      {
          if ($dom->documentElement) {
              $this->addNode($dom->documentElement);
          }
      }

      /**
       * Adds a \DOMNodeList to the list of nodes.
       *
       * @param \DOMNodeList $nodes A \DOMNodeList instance
       *
       * @api
       */
      public function addNodeList(\DOMNodeList $nodes)
      {
          foreach ($nodes as $node) {
              $this->addNode($node);
          }
      }

      /**
       * Adds an array of \DOMNode instances to the list of nodes.
       *
       * Each entry goes through add(), so any type accepted by add() is
       * accepted here as well.
       *
       * @param \DOMNode[] $nodes An array of \DOMNode instances
       *
       * @api
       */
      public function addNodes(array $nodes)
      {
          foreach ($nodes as $node) {
              $this->add($node);
          }
      }

      /**
       * Adds a \DOMNode instance to the list of nodes.
       *
       * A \DOMDocument is replaced by its document element; a document that
       * has no root element adds nothing (same rule as addDocument()).
       *
       * @param \DOMNode $node A \DOMNode instance
       *
       * @api
       */
      public function addNode(\DOMNode $node)
      {
          if ($node instanceof \DOMDocument) {
              $node = $node->documentElement;
          }

          // documentElement is null for an empty document: nothing to attach
          if (null !== $node) {
              $this->attach($node);
          }
      }

      /**
       * Returns a node given its position in the node list.
       *
       * @param int $position The position
       *
       * @return Crawler A new instance of the Crawler with the selected node, or an empty Crawler if it does not exist.
       *
       * @api
       */
      public function eq($position)
      {
          foreach ($this as $i => $node) {
              if ($i == $position) {
                  return new static($node, $this->uri);
              }
          }

          return new static(null, $this->uri);
      }

      /**
       * Calls an anonymous function on each node of the list.
       *
       * The anonymous function receives the node wrapped in a Crawler
       * instance as first argument and its position as second argument.
       *
       * Example:
       *
       *     $crawler->filter('h1')->each(function ($node, $i) {
       *         return $node->text();
       *     });
       *
       * @param \Closure $closure An anonymous function
       *
       * @return array An array of values returned by the anonymous function
       *
       * @api
       */
      public function each(\Closure $closure)
      {
          $data = array();
          foreach ($this as $i => $node) {
              $data[] = $closure(new static($node, $this->uri), $i);
          }

          return $data;
      }

      /**
       * Reduces the list of nodes by calling an anonymous function.
       *
       * To remove a node from the list, the anonymous function must return false.
       * Any other return value (including null) keeps the node.
       *
       * @param \Closure $closure An anonymous function
       *
       * @return Crawler A Crawler instance with the selected nodes.
       *
       * @api
       */
      public function reduce(\Closure $closure)
      {
          $nodes = array();
          foreach ($this as $i => $node) {
              if (false !== $closure(new static($node, $this->uri), $i)) {
                  $nodes[] = $node;
              }
          }

          return new static($nodes, $this->uri);
      }

      /**
       * Returns the first node of the current selection
       *
       * @return Crawler A Crawler instance with the first selected node
       *
       * @api
       */
      public function first()
      {
          return $this->eq(0);
      }

      /**
       * Returns the last node of the current selection
       *
       * @return Crawler A Crawler instance with the last selected node
       *
       * @api
       */
      public function last()
      {
          return $this->eq(count($this) - 1);
      }

      /**
       * Returns the siblings nodes of the current selection
       *
       * Siblings are the element nodes sharing the parent of the first
       * node of the list, that first node excluded. A first node without
       * a parent has no siblings.
       *
       * @return Crawler A Crawler instance with the sibling nodes
       *
       * @throws \InvalidArgumentException When current node is empty
       *
       * @api
       */
      public function siblings()
      {
          if (!count($this)) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          $parent = $this->getNode(0)->parentNode;

          return new static($parent ? $this->sibling($parent->firstChild) : array(), $this->uri);
      }

      /**
       * Returns the next siblings nodes of the current selection
       *
       * @return Crawler A Crawler instance with the next sibling nodes
       *
       * @throws \InvalidArgumentException When current node is empty
       *
       * @api
       */
      public function nextAll()
      {
          if (!count($this)) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          return new static($this->sibling($this->getNode(0)), $this->uri);
      }

      /**
       * Returns the previous sibling nodes of the current selection
       *
       * The nodes are listed walking backwards from the first node of the list.
       *
       * @return Crawler A Crawler instance with the previous sibling nodes
       *
       * @throws \InvalidArgumentException When current node is empty
       *
       * @api
       */
      public function previousAll()
      {
          if (!count($this)) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          return new static($this->sibling($this->getNode(0), 'previousSibling'), $this->uri);
      }

      /**
       * Returns the parents nodes of the current selection
       *
       * Ancestors of the first node are listed from the closest one upwards.
       *
       * @return Crawler A Crawler instance with the parents nodes of the current selection
       *
       * @throws \InvalidArgumentException When current node is empty
       *
       * @api
       */
      public function parents()
      {
          if (!count($this)) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          $node = $this->getNode(0);
          $nodes = array();

          while ($node = $node->parentNode) {
              // keep element nodes only, and skip the synthetic "_root"
              // wrapper element created by filterXPath()
              if (1 === $node->nodeType && '_root' !== $node->nodeName) {
                  $nodes[] = $node;
              }
          }

          return new static($nodes, $this->uri);
      }

      /**
       * Returns the children nodes of the current selection
       *
       * Only the element children of the first node of the list are returned.
       *
       * @return Crawler A Crawler instance with the children nodes
       *
       * @throws \InvalidArgumentException When current node is empty
       *
       * @api
       */
      public function children()
      {
          if (!count($this)) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          $node = $this->getNode(0)->firstChild;

          return new static($node ? $this->sibling($node) : array(), $this->uri);
      }

      /**
       * Returns the attribute value of the first node of the list.
       *
       * @param string $attribute The attribute name
       *
       * @return string The attribute value
       *
       * @throws \InvalidArgumentException When current node is empty
       *
       * @api
       */
      public function attr($attribute)
      {
          if (!count($this)) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          return $this->getNode(0)->getAttribute($attribute);
      }

      /**
       * Returns the node value of the first node of the list.
       *
       * @return string The node value
       *
       * @throws \InvalidArgumentException When current node is empty
       *
       * @api
       */
      public function text()
      {
          if (!count($this)) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          return $this->getNode(0)->nodeValue;
      }

      /**
       * Returns the first node of the list as HTML.
       *
       * The result is the concatenated markup of the child nodes of the
       * first node; the node's own tag is not included.
       *
       * @return string The node html
       *
       * @throws \InvalidArgumentException When current node is empty
       */
      public function html()
      {
          if (!count($this)) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          // node parameter was added to the saveHTML() method in PHP 5.3.6
          // @see http://php.net/manual/en/domdocument.savehtml.php
          $canSaveNode = version_compare(PHP_VERSION, '5.3.6', '>=');

          $html = '';
          foreach ($this->getNode(0)->childNodes as $child) {
              if ($canSaveNode) {
                  $html .= $child->ownerDocument->saveHTML($child);
              } else {
                  // older PHP: serialize a copy of the child from its own document
                  $document = new \DOMDocument('1.0', 'UTF-8');
                  $document->appendChild($document->importNode($child, true));
                  $html .= rtrim($document->saveHTML());
              }
          }

          return $html;
      }

      /**
       * Extracts information from the list of nodes.
       *
       * You can extract attributes or/and the node value (_text).
       *
       * Example:
       *
       *     $crawler->filter('h1 a')->extract(array('_text', 'href'));
       *
       * When a single attribute is requested, each entry of the result is
       * the value itself; with several attributes each entry is an array
       * of values in the requested order.
       *
       * @param array $attributes An array of attributes
       *
       * @return array An array of extracted values
       *
       * @api
       */
      public function extract($attributes)
      {
          $attributes = (array) $attributes;

          $data = array();
          foreach ($this as $node) {
              $elements = array();
              foreach ($attributes as $attribute) {
                  if ('_text' === $attribute) {
                      $elements[] = $node->nodeValue;
                  } else {
                      $elements[] = $node->getAttribute($attribute);
                  }
              }

              $data[] = count($attributes) > 1 ? $elements : $elements[0];
          }

          return $data;
      }

      /**
       * Filters the list of nodes with an XPath expression.
       *
       * The nodes of the list are deep-copied under a synthetic "_root"
       * element of a new document, and the expression is evaluated against
       * that document; the returned Crawler therefore holds copies.
       *
       * @param string $xpath An XPath expression
       *
       * @return Crawler A new instance of Crawler with the filtered list of nodes
       *
       * @api
       */
      public function filterXPath($xpath)
      {
          $document = new \DOMDocument('1.0', 'UTF-8');
          $root = $document->appendChild($document->createElement('_root'));
          foreach ($this as $node) {
              $root->appendChild($document->importNode($node, true));
          }

          $domxpath = new \DOMXPath($document);

          return new static($domxpath->query($xpath), $this->uri);
      }

      /**
       * Filters the list of nodes with a CSS selector.
       *
       * This method only works if you have installed the CssSelector Symfony Component.
       *
       * @param string $selector A CSS selector
       *
       * @return Crawler A new instance of Crawler with the filtered list of nodes
       *
       * @throws \RuntimeException if the CssSelector Component is not available
       *
       * @api
       */
      public function filter($selector)
      {
          if (!class_exists('Symfony\\Component\\CssSelector\\CssSelector')) {
              // @codeCoverageIgnoreStart
              throw new \RuntimeException('Unable to filter with a CSS selector as the Symfony CssSelector is not installed (you can use filterXPath instead).');
              // @codeCoverageIgnoreEnd
          }

          return $this->filterXPath(CssSelector::toXPath($selector));
      }

      /**
       * Selects links by name or alt value for clickable images.
       *
       * @param string $value The link text
       *
       * @return Crawler A new instance of Crawler with the filtered list of nodes
       *
       * @api
       */
      public function selectLink($value)
      {
          // the value is padded with spaces so that it is matched as whole words
          $padded = static::xpathLiteral(' '.$value.' ');

          $xpath = sprintf('//a[contains(concat(\' \', normalize-space(string(.)), \' \'), %s)] ', $padded).
                   sprintf('| //a/img[contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)]/ancestor::a', $padded);

          return $this->filterXPath($xpath);
      }

      /**
       * Selects a button by name or alt value for images.
       *
       * Inputs of type submit/button are matched on their value, inputs of
       * type image on their alt text, buttons on their text; inputs and
       * buttons are also matched on an exact id or name.
       *
       * @param string $value The button text
       *
       * @return Crawler A new instance of Crawler with the filtered list of nodes
       *
       * @api
       */
      public function selectButton($value)
      {
          // the value is padded with spaces so that it is matched as whole words
          $padded = static::xpathLiteral(' '.$value.' ');
          // id/name are compared exactly; the value still has to be turned
          // into an XPath literal, otherwise a quote in it breaks the expression
          $exact = static::xpathLiteral($value);

          $xpath = sprintf('//input[((@type="submit" or @type="button") and contains(concat(\' \', normalize-space(string(@value)), \' \'), %s)) ', $padded).
                   sprintf('or (@type="image" and contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)) or @id=%s or @name=%s] ', $padded, $exact, $exact).
                   sprintf('| //button[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) or @id=%s or @name=%s]', $padded, $exact, $exact);

          return $this->filterXPath($xpath);
      }

      /**
       * Returns a Link object for the first node in the list.
       *
       * @param string $method The method for the link (get by default)
       *
       * @return Link A Link instance
       *
       * @throws \InvalidArgumentException If the current node list is empty
       *
       * @api
       */
      public function link($method = 'get')
      {
          if (!count($this)) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          $node = $this->getNode(0);

          return new Link($node, $this->uri, $method);
      }

      /**
       * Returns an array of Link objects for the nodes in the list.
       *
       * @return Link[] An array of Link instances
       *
       * @api
       */
      public function links()
      {
          $links = array();
          foreach ($this as $node) {
              $links[] = new Link($node, $this->uri, 'get');
          }

          return $links;
      }

      /**
       * Returns a Form object for the first node in the list.
       *
       * @param array  $values An array of values for the form fields
       * @param string $method The method for the form
       *
       * @return Form A Form instance
       *
       * @throws \InvalidArgumentException If the current node list is empty
       *
       * @api
       */
      public function form(array $values = null, $method = null)
      {
          if (!count($this)) {
              throw new \InvalidArgumentException('The current node list is empty.');
          }

          $form = new Form($this->getNode(0), $this->uri, $method);

          if (null !== $values) {
              $form->setValues($values);
          }

          return $form;
      }

      /**
       * Converts string for XPath expressions.
       *
       * Escaped characters are: quotes (") and apostrophe (').
       *
       * Examples:
       * <code>
       * echo Crawler::xpathLiteral('foo " bar');
       * //prints 'foo " bar'
       *
       * echo Crawler::xpathLiteral("foo ' bar");
       * //prints "foo ' bar"
       *
       * echo Crawler::xpathLiteral('a\'b"c');
       * //prints concat('a', "'", 'b"c')
       * </code>
       *
       * @param string $s String to be escaped
       *
       * @return string Converted string
       */
      public static function xpathLiteral($s)
      {
          if (false === strpos($s, "'")) {
              return sprintf("'%s'", $s);
          }

          if (false === strpos($s, '"')) {
              return sprintf('"%s"', $s);
          }

          // the string holds both kinds of quotes: split it on apostrophes
          // and glue the pieces back together with concat()
          $string = $s;
          $parts = array();
          while (true) {
              if (false !== $pos = strpos($string, "'")) {
                  $parts[] = sprintf("'%s'", substr($string, 0, $pos));
                  $parts[] = "\"'\"";
                  $string = substr($string, $pos + 1);
              } else {
                  $parts[] = "'$string'";
                  break;
              }
          }

          // separator first: the reversed argument order is not accepted by PHP 8
          return sprintf("concat(%s)", implode(', ', $parts));
      }

      /**
       * Returns the node stored at the given position.
       *
       * @param int $position The position
       *
       * @return \DOMNode|null The node, or null when there is no node at that position
       */
      protected function getNode($position)
      {
          foreach ($this as $i => $node) {
              if ($i == $position) {
                  return $node;
              }
          // @codeCoverageIgnoreStart
          }

          return null;
          // @codeCoverageIgnoreEnd
      }

      /**
       * Collects the element nodes found by walking from a node in one direction.
       *
       * The walk starts at $node (included) and follows the $siblingDir
       * property until it is exhausted; the first node of the crawler is
       * never part of the result.
       *
       * @param \DOMNode|null $node       The node to start from
       * @param string        $siblingDir The property to follow ('nextSibling' or 'previousSibling')
       *
       * @return \DOMNode[] The matching element nodes, in walking order
       */
      protected function sibling($node, $siblingDir = 'nextSibling')
      {
          // looked up once: it does not change while walking
          $first = $this->getNode(0);

          $nodes = array();
          for (; $node; $node = $node->$siblingDir) {
              if ($node !== $first && 1 === $node->nodeType) {
                  $nodes[] = $node;
              }
          }

          return $nodes;
      }
  }