wengoooo / haixun
There is no license information available for the latest version (v0.3) of this package.
v0.3
2019-09-21 06:26 UTC
Requires
- php: >7.0
- campo/random-user-agent: ^1.3
- guzzlehttp/guzzle: 6.3.3
- pleonasm/bloom-filter: ^1.0
- predis/predis: ^1.1
- symfony/css-selector: ^4.3
- symfony/dom-crawler: ^4.3
This package is auto-updated.
Last update: 2025-03-21 21:44:07 UTC
README
安装
环境要求
- PHP >= 7.0
- PHP cURL 扩展
- PHP OpenSSL 扩展
安装
使用 composer:
$ composer require wengoooo/haixun
快速开始
建立一个爬虫
require_once "vendor/autoload.php"; use GuzzleHttp\Psr7\Request; class TheBaseSpider extends \Haixun\Core\Spiders { public $maxPage = 1; public $currentPage = 1; public $userId; // public $startUrls = ['http://www.httpbin.org/get', 'http://www.httpbin.org/user-agent']; public function startRequests() { yield new Request("GET", "https://www.domain.com/categories/1735750"); } public function parse(Haixun\Http\Response $response, $index) { if (sizeof($response->css("#max_page")) > 0) { $this->maxPage = (int)$response->css("#max_page")->text(); $this->currentPage = 1; preg_match_all("%(user_[^']+)%", $response->getBodyContents(), $result, PREG_PATTERN_ORDER); $this->userId = $result[0][0]; } $uri = new \GuzzleHttp\Psr7\Uri($response->getCurrentUrl()); while ($this->currentPage++ <= $this->maxPage) { yield new Request("GET", sprintf("https://%s/load_items/categories/1735750/%s/%s/0", $uri->getHost(), $this->currentPage, $this->userId)); } foreach ($response->css(".item a[href*=items]")->links() as $link) { yield new Request("GET", $link->getUri(), ['meta' => ['callback' => 'parseProduct']]); } } public function parseProduct(Haixun\Http\Response $response, $index) { var_dump($response->css("h2.itemTitle")->text()); } public function finish() {} }
启动爬虫
// Wire the spider into a crawler instance and start the crawl.
$spider = new TheBaseSpider();
$crawler = new \Haixun\Core\Crawler($spider);
$crawler->crawl();
DomCrawler Crawler
实例化
// BUGFIX: the original snippet instantiated `new Crawler()` without importing
// the class, which fatals in a standalone script ("Class 'Crawler' not found").
use Symfony\Component\DomCrawler\Crawler;

$url = 'https://movie.douban.com/subject/25812712/?from=showing';
// Fetch the raw HTML. NOTE(review): file_get_contents() returns false on
// failure — real code should check before feeding it to the crawler.
$response = file_get_contents($url);

// 进行XPath页面数据抽取 (extract page data via XPath below)
$data = []; // 结构化数据存本数组 (structured results are collected here)
$crawler = new Crawler();
$crawler->addHtmlContent($response);
查找元素
// XPath selection
$xpathNode = $crawler->filterXPath('//*[@id="content"]/h1/span[1]');
$xpathNode->text();
$xpathNode->html();

// CSS selection (same element, selector syntax differs)
$cssNode = $crawler->filter('#content h1 span');
$cssNode->text();
$cssNode->html();
遍历元素
// BUGFIX: as a one-liner, the original's "# 获取属性" comment swallowed the
// closing "});", and the attr() call lacked a terminating semicolon — both
// parse errors. Properly formatted below.
$crawler->filterXPath('//ul[contains(@class,"celebrities-list from-subject")]/li')
    ->each(function (Crawler $node, $i) {
        $node->attr("class"); // 获取属性 (read the node's "class" attribute)
    });
获取总数
$crawler->filter(".item a[href*=items]")->count();
遍历所有链接
foreach($crawler->filter(".item a[href*=items]")->links() as $link) { echo $link->getUri(); }