skyblack / diz-scraping
Diz Scraping
0.1.3
2024-01-31 15:48 UTC
Requires
- php: >=7.4
- ext-curl: *
- ext-json: *
- skyblack/diz-toolkit: ^0.1.2
- wikimedia/composer-merge-plugin: ^2.0
Requires (Dev)
- phpunit/phpunit: ^9
This package is auto-updated.
Last update: 2024-10-30 01:54:12 UTC
README
A PHP kit for sending a request and receiving a response over HTTP using the native cURL library.
Simple example:
$crawler = new Crawler('sample.com'); // Create a crawler instance with a target site domain - sample.com
$content = $crawler->get('test'); // get a page at https://sample.com/test
$extractor = new Extractor($content); // Create an extractor instance for the result content
$item = $extractor->extract($regexp); // Extract the desired element from the text using some regexp
Advanced example:
// Create a crawler instance and add a JSON pipe
// The JSON response will be automatically converted to an array
$crawler = (new Crawler('sample.com'))->addJSONPipe();
// Set cookies to be used in the upcoming request
$crawler->addSimpleCookie('user-id', '123');
// Add a permanent custom header data
$crawler->addHeader('Token', 'zxy123gh');
// Create a request instance with a target URL - https://sample.com/post
$request = $crawler->newRequest('post');
// Switch request to POST method and JSON body type
$request->toPOST($data, DataType::JSON);
// Add a custom header data only to this request
$request->addHeader('X-Custom-Data', 'Foo Bar');
$result = $request->send();
// Get actual cookies
$cookies = $crawler->obtainCookies();
Although the crawler can be used as is, a more appropriate solution is class inheritance:
class MyCrawler extends Crawler
{
public function __construct()
{
parent::__construct('sample.com', 'my'); // https://my.sample.com
$this->addJSONPipe();
$this->addCallbackPipe([$this, 'checkResponse']);
}
// Add a callback pipe to check the status of the response from the server
public function checkResponse(array $result): array
{
if (($result['status'] ?? null) !== 'ok') {
throw new \Exception($result['message'] ?? null);
}
return $result['content'];
}
public function getItems(string $category): array
{
return $this->get('items', ['category' => $category]);
}
// Listen the download event to prevent downloading erroneous files
protected function onDownloadEvent(DownloadEvent $event): void
{
$response = $event->getResponse();
$status_code = $response->getStatusCode();
$size = $response->getDownloadSize();
$destination = $event->getDestination();
if ($status_code != 200 || $size == 0) {
unlink($destination);
if ($status_code == 404) {
throw new \Exception('File not found.');
} else {
throw new \Exception('File download error.');
}
}
}
}