wengooooo / blackspider
A complete web scraping toolkit for PHP
v0.26
2024-03-19 12:12 UTC
Requires
- php: ^8.0
- guzzlehttp/guzzle: ^7.4.5
- jakeasmith/http_build_url: ^1.0.1
- league/container: ^4.2
- monolog/monolog: >2.3
- psr/container: ^2.0
- psy/psysh: >0.11.1
- sebastian/version: >3.0
- spatie/robots-txt: ^2.0
- symfony/console: ^6.0
- symfony/css-selector: ^6.0
- symfony/dom-crawler: ^6.0
- symfony/event-dispatcher: ^6.0
- symfony/options-resolver: ^6.0
Requires (Dev)
- ergebnis/composer-normalize: ^2.15
- ergebnis/php-cs-fixer-config: ^3.0
- http-interop/http-factory-guzzle: ^1.2
- phpunit/phpunit: ^9.5
- psr/http-message: ^1.0.1
- roave/security-advisories: dev-latest
- slim/slim: ^4.8
- spatie/browsershot: ^3.52
- spatie/phpunit-watcher: ^1.23
- vimeo/psalm: ^4.23
Suggests
- spatie/browsershot: Required to execute JavaScript in spiders
README
<?php

declare(strict_types=1);

require_once 'vendor/autoload.php';

use BlackSpider\Downloader\Middleware\RetryMiddleware;
use BlackSpider\Downloader\Middleware\UserAgentMiddleware;
use BlackSpider\Extensions\LoggerExtension;
use BlackSpider\Http\Request;
use BlackSpider\Http\Response;
use BlackSpider\Spider\BasicSpider;
use BlackSpider\Spider\Configuration\Overrides;

/**
 * Example spider that issues a single GET request to httpbin's
 * user-agent endpoint and hands the response to parse().
 */
class MySpider extends BasicSpider
{
    /**
     * Callback registered for the initial request.
     *
     * Placeholder implementation: it yields nothing. Replace the body with
     * your own extraction logic.
     */
    public function parse(Response $response): \Generator
    {
        // An empty body is not a generator, so calling it would raise a
        // TypeError against the declared \Generator return type. Delegating
        // to an empty iterable makes this a valid generator that yields nothing.
        yield from [];
    }

    /**
     * Builds the requests the spider starts with.
     *
     * @return Request[]
     */
    protected function initialRequests(): array
    {
        return [
            new Request(
                'GET',
                'https://www.httpbin.org/user-agent',
                [$this, 'parse'],
            ),
        ];
    }
}

\BlackSpider\BlackSpider::startSpider(
    MySpider::class,
    new Overrides(
        // NOTE(review): MySpider overrides initialRequests(), so this start URL
        // may not be used for the initial request -- confirm against BasicSpider.
        startUrls: ['https://my-override-url.com'],
        downloaderMiddleware: [
            UserAgentMiddleware::class,
            [
                RetryMiddleware::class,
                [
                    // Retry when there is no response at all, or when the
                    // response body contains the substring "user".
                    'should_retry_callback' => function (?Response $response = null): bool {
                        return $response === null
                            || str_contains((string) $response->getBody(), 'user');
                    },
                ],
            ],
        ],
        extensions: [
            LoggerExtension::class,
        ],
    ),
);