grab / spider
This package is abandoned and no longer maintained.
No replacement package was suggested.
Asynchronous PHP scraper built on multi-cURL and ReactPHP, with proxy support, inspired by Python's Grab
0.2
2017-05-10 14:43 UTC
Requires
- php: >=5.4.0
- imangazaliev/didom: *
- khr/react-curl: *
- react/event-loop: *
- react/promise: ^2.4
Requires (Dev)
- codeclimate/php-test-reporter: dev-master
- squizlabs/php_codesniffer: *
This package is not auto-updated.
Last update: 2020-08-26 22:38:54 UTC
README
Asynchronous PHP scraper built on multi-cURL and ReactPHP, inspired by Python's Grab
Installation
To install grab-spider run the command:
composer require grab/spider "dev-master"
Quick start
<?php

require __DIR__ . '/../vendor/autoload.php';

/**
 * Example spider: walks the first Hacker News listing pages, queues every
 * story link found on them, and prints the <title> of each linked page.
 */
class HackerNewCrawler extends \Grab\Spider
{
    /** Site root used to build listing URLs and to resolve relative story links. */
    const BASE_URL = 'https://news.ycombinator.com';

    /**
     * Queues the initial "page" tasks, one per listing page (pages 1 to 4).
     */
    public function taskGenerator()
    {
        foreach (range(1, 4) as $page) {
            $this->task('page', [
                'url' => sprintf(self::BASE_URL . '/news?p=%d', $page),
                'max_request' => 10,
            ]);
        }
    }

    /**
     * Handles a fetched listing page: queues a "topic" task for every story link.
     *
     * @param mixed $parser Parsed document exposing find() (a DiDom document by default).
     * @param mixed $task   The task that produced this response (unused here).
     */
    public function taskPage($parser, $task)
    {
        foreach ($parser->find('.storylink') as $link) {
            $href = trim((string) $link->getAttribute('href'));

            // A link without a target cannot be fetched; skip it instead of queuing a doomed request.
            if ($href === '') {
                continue;
            }

            // cURL needs an absolute URL: complete protocol-relative links and
            // resolve scheme-less (site-relative) links against the site root.
            if (strpos($href, '//') === 0) {
                $href = 'https:' . $href;
            } elseif (!preg_match('#^[a-z][a-z0-9+.-]*://#i', $href)) {
                $href = self::BASE_URL . '/' . ltrim($href, '/');
            }

            $this->task('topic', [
                'url' => $href,
                'curl_config' => [
                    CURLOPT_TIMEOUT => 60,
                ],
                'max_request' => 10,
            ]);
        }
    }

    /**
     * Handles a fetched story page: prints its trimmed <title> text.
     *
     * @param mixed $parser Parsed document exposing find().
     * @param mixed $task   The task that produced this response (unused here).
     */
    public function taskTopic($parser, $task)
    {
        $titles = $parser->find('title');

        // Responses without a <title> element (e.g. non-HTML targets) have nothing to print.
        if (!isset($titles[0])) {
            return;
        }

        echo trim($titles[0]->text()) . PHP_EOL;
    }
}

$bot = new HackerNewCrawler();
$bot->debug = true;
$bot->setCurlSetting([
    CURLOPT_USERAGENT => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
]);
// Optional: uncomment to load a proxy list from a file.
//$bot->loadProxy(__DIR__ . '/proxy_list.txt');
$bot->run();
Simple dependency injection for changing the parser
// Each example passes a callable to the spider's constructor. Judging by the
// closures below, the callable receives the raw response content and returns
// the object that is handed to the task handlers as the parser (presumably;
// confirm against \Grab\Spider).

// 1. Reuse one DiDom document through an array callable.
$parser = new \DiDom\Document();
$bot = new HackerNewCrawler([$parser, 'load']);

// 2. Build a fresh DiDom document for every response.
$bot = new HackerNewCrawler(function ($body) {
    $document = new \DiDom\Document();

    return $document->load($body);
});

// 3. Parse the response with SimpleXML.
$bot = new HackerNewCrawler(function ($body) {
    return simplexml_load_string($body);
});

// 4. Pass the response content to SoapClient as its first constructor argument.
$bot = new HackerNewCrawler(function ($body) {
    return new \SoapClient($body);
});