maikelvanmaurik/schrapert

v0.0.3 2020-04-01 11:41 UTC

This package is auto-updated.

Last update: 2024-04-16 06:07:50 UTC


README

Schrapert is a scraping/crawling library inspired by Scrapy. It uses React (ReactPHP) for various operations, such as downloading requests and writing files.

Example of a simple spider:

namespace Crawl;
use Schrapert\Spider;
use Schrapert\Crawl\ResponseInterface;
use Schrapert\Http\ResponseInterface as HttpResponse;
use Schrapert\Http\Request as HttpRequest;
use DOMDocument;
use DOMXPath;
use DOMElement;
/**
 * Example spider that follows every hyperlink found on a downloaded page.
 */
class BlogSpider extends Spider
{
    /**
     * Extracts the links from an HTTP response and yields a new request for each.
     *
     * Non-HTTP responses, responses with an empty body and documents that
     * cannot be parsed produce no requests.
     *
     * @param ResponseInterface $response The response handed to the spider.
     *
     * @return \Generator Yields one HttpRequest per anchor that has an href attribute.
     */
    public function parse(ResponseInterface $response)
    {
        if (!$response instanceof HttpResponse) {
            return;
        }

        $html = (string) $response->getBody();
        if ($html === '') {
            // DOMDocument::loadHTML() rejects an empty string (ValueError on
            // PHP 8, warning on older versions), and there is nothing to follow.
            return;
        }

        // Real-world markup is rarely valid: collect libxml parse errors
        // internally instead of emitting a warning for each one, then put the
        // previous setting back so other code is not affected.
        $previousSetting = libxml_use_internal_errors(true);
        $doc = new DOMDocument('1.0');
        $loaded = $doc->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);

        if ($loaded === false) {
            return;
        }

        $xpath = new DOMXPath($doc);

        // Only anchors that actually carry an href: an anchor without one
        // would otherwise yield a request for the joined empty string.
        foreach ($xpath->query('//a[@href]') as $node) {
            if (!$node instanceof DOMElement) {
                continue;
            }

            // $this->uri is not defined in this class; presumably it is
            // provided by the Spider base class and joins the href with the
            // URI of the page it was found on.
            $uri = $this->uri->join($node->getAttribute('href'), $response->getUri());

            yield new HttpRequest($uri);
        }
    }
}