grab/spider

This package is abandoned and no longer maintained. No replacement package was suggested.

Asynchronous PHP scraper built on multi-cURL and ReactPHP, with proxy support; inspired by the Python Grab framework

Maintainers

Package info

github.com/strelov1/Spider

pkg:composer/grab/spider

Statistics

Installs: 118

Dependents: 0

Suggesters: 0

Stars: 11

Open Issues: 1

0.2 2017-05-10 14:43 UTC

This package is not auto-updated.

Last update: 2020-08-26 22:38:54 UTC


README

Asynchronous PHP scraper built on multi-cURL and ReactPHP, inspired by the Python Grab framework

Installation

To install grab-spider run the command:

composer require grab/spider "dev-master" 
    

Quick start

<?php

require __DIR__ . '/../vendor/autoload.php';

/**
 * Example spider: walks the first four Hacker News listing pages, follows
 * every story link found on them and prints the <title> of each target page.
 *
 * Task names passed to task() ('page', 'topic') correspond to the
 * task<Name>() handler methods declared below.
 */
class HackerNewCrawler extends \Grab\Spider
{
    /**
     * Queues one "page" task for each listing page (p=1 .. p=4).
     */
    public function taskGenerator()
    {
        foreach (range(1, 4) as $pageNumber) {
            $this->task('page', [
                'url' => sprintf('https://news.ycombinator.com/news?p=%d', $pageNumber),
                'max_request' => 10,
            ]);
        }
    }

    /**
     * Handles a fetched listing page: queues a "topic" task per story link.
     *
     * @param mixed $parser Parsed page; must expose find($selector).
     * @param mixed $task   The task that produced this page (unused here).
     */
    public function taskPage($parser, $task)
    {
        foreach ($parser->find('.storylink') as $link) {
            $href = trim((string) $link->getAttribute('href'));
            if ($href === '') {
                // An anchor without a usable href cannot be fetched.
                continue;
            }

            // Hacker News self-posts (Ask HN, etc.) use relative hrefs such
            // as "item?id=123"; resolve those against the site root so the
            // queued URL is always absolute.
            $parts = parse_url($href);
            if (is_array($parts) && !isset($parts['scheme']) && !isset($parts['host'])) {
                $href = 'https://news.ycombinator.com/' . ltrim($href, '/');
            }

            $this->task('topic', [
                'url' => $href,
                'curl_config' => [
                    // Allow up to 60 seconds for slow third-party sites.
                    CURLOPT_TIMEOUT => 60,
                ],
                'max_request' => 10,
            ]);
        }
    }

    /**
     * Handles a fetched story page: prints its <title> text.
     *
     * @param mixed $parser Parsed page; must expose find($selector).
     * @param mixed $task   The task that produced this page (unused here).
     */
    public function taskTopic($parser, $task)
    {
        $titles = $parser->find('title');
        if (empty($titles)) {
            // Not every linked resource is an HTML document with a <title>
            // (PDFs, images, error pages); skip instead of calling text()
            // on a missing element.
            return;
        }

        echo trim($titles[0]->text()) . PHP_EOL;
    }
}

// Browser-like User-Agent string handed to cURL via CURLOPT_USERAGENT.
$userAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36';

$bot = new HackerNewCrawler();
// NOTE(review): presumably switches on verbose diagnostic output - confirm in \Grab\Spider.
$bot->debug = true;
$bot->setCurlSetting([CURLOPT_USERAGENT => $userAgent]);
// Uncomment to load a proxy list from a file next to this script.
//$bot->loadProxy(__DIR__ . '/proxy_list.txt');
$bot->run();

Simple dependency injection for swapping the parser

    // The four assignments below are alternatives; each passes a different
    // callable to the spider constructor. The closures take the fetched
    // content as their only argument.
    // NOTE(review): presumably the callable's return value is what the
    // task handlers receive as $parser - confirm in \Grab\Spider.

    // 1. Array callable bound to a DiDom document's load() method.
    $bot = new HackerNewCrawler([new \DiDom\Document(), 'load']);

    // 2. Closure that creates a fresh DiDom document for every response.
    $bot = new HackerNewCrawler(function ($html) {
        return (new \DiDom\Document())->load($html);
    });

    // 3. Closure that parses the content with SimpleXML.
    $bot = new HackerNewCrawler(function ($xml) {
        return simplexml_load_string($xml);
    });

    // 4. Closure that hands the content to SoapClient's constructor.
    $bot = new HackerNewCrawler(function ($wsdl) {
        return new \SoapClient($wsdl);
    });