ljw / spider
a simple route
dev-master
2023-02-07 08:31 UTC
Requires
- php: >=7.0
- ext-json: *
- ext-mbstring: *
- guzzlehttp/guzzle: >6.0
This package is auto-updated.
Last update: 2025-04-07 13:30:56 UTC
README
install
composer require ljw/spider:dev-master
example
// Example: configure a crawl of www.ibookv.com, then start the spider.
require __DIR__ . '/../vendor/autoload.php';

$host = 'http://www.ibookv.com/';

$config = [
    'entry' => $host,
    'domains' => ['www.ibookv.com'],
    'max_try_num' => 5,
    'max_depth' => 0,
    'task_num' => 1,
    // 'log_filename' => 'spider.log',
    'log_level' => Ljw\Spider\Log::LEVEL_DEBUG, // log level
    // 'log_show' => 1, // whether to print the log to the console
    // 'multi_num' => 5, // guzzle concurrent requests; not recommended when multiple tasks are enabled
    'interval' => [500, 1200], // request interval: one number, or an array giving the min and max interval, in milliseconds
    'auto_add' => false, // whether to automatically parse every <a> tag on a page
    'ask_continue' => 'clear', // clear: clear directly; continue: continue directly; ask: prompt
    'show_task_panel' => 1, // show task status
    'guzzle' => [
        'verify' => false, // false is recommended: HTTPS is not verified
        // `timeout` is a Guzzle request option (in seconds), not an HTTP header, so it
        // sits beside `headers` instead of inside it, where it would only be sent as a
        // header named "timeout".
        // NOTE(review): assumes this array is handed to the Guzzle client — confirm.
        'timeout' => 10,
        'headers' => [
            'User-Agent' => 'ljw',
            'Client-Ip' => '127.0.0.1',
        ],
    ],
    // Proxy: an array, or a closure.
    'proxy' => [],
    // 'proxy' => function($url_info){},
    'queue_redis' => 1, // whether to use redis; forced on when task_num > 1
    'redis' => [
        'host' => '127.0.0.1',
        'port' => 6379,
        'pwd' => '',
        'database' => 6,
        'prefix' => 'lll:',
        'timeout' => 30,
    ],
    'pages' => [
        // Rule with a single XPath selector.
        [
            'url' => 'http://www.ibookv.com/book/\d+\.html',
            'selector' => '//*[contains(@class,"book-info")]//h1',
            'only_one' => 1,
            'callback' => function ($data, $url_info, $html, $spider) {
                // var_dump($data);
            },
        ],
        // Rule with named, nested selectors for the same URL pattern.
        [
            'url' => 'http://www.ibookv.com/book/\d+\.html',
            'selector' => [
                [
                    'name' => 'book_name',
                    'only_one' => 1,
                    'selector' => '//*[contains(@class,"book-info")]//h1',
                ],
                [
                    'name' => 'author',
                    'only_one' => 1,
                    'selector' => '//*[contains(@class,"writer")]',
                ],
                [
                    'name' => 'chapters',
                    'only_one' => 1,
                    'selector' => [
                        [
                            'name' => 'chapter_title',
                            'only_one' => 0,
                            'selector' => '//*[@id="l2"]//ul//li/a',
                        ],
                        [
                            'name' => 'chapter_url',
                            'only_one' => 0,
                            'selector' => '//*[@id="l2"]//ul//li/a/@href',
                        ],
                    ],
                ],
            ],
            'only_one' => 1,
            'callback' => function ($data, $url_info, $html, $spider) {
                // var_dump($data);
            },
        ],
        // Rule that uses a regular expression instead of XPath.
        [
            'url' => 'http://www.ibookv.com/category/\d+\.html',
            'selector_type' => 'regex',
            'selector' => '%<li ><a href=".*?">(.*?)</a></li>%i',
            'callback' => function ($data, $url_info, $html, $spider) {
                // var_dump($data);
            },
        ],
    ],
    'reload_func' => function ($spider) {
    },
];

$spider = new Ljw\Spider\Spider($config);

// When the queue is empty.
$spider->empty_queue_func = function ($spider) {
    $spider->reset();
};

$spider->start();