c4pone/yolo_crawler

An event-based domain crawler

dev-master 2015-03-02 04:05 UTC

This package is not auto-updated.

Last update: 2024-04-13 14:23:51 UTC


README

# yolo crawler

| Status Label | Status Value |
| --- | --- |
| Build | Build Status |
| Code Quality | Scrutinizer Code Quality |

Example: finding broken links

require 'bootstrap/autoload.php';

use WP\Crawler\LinkFinder;
use WP\Crawler\DomainCrawler;
use WP\Crawler\Queue\QueueManager;
use WP\Crawler\Queue\ArrayQueue;
use WP\Crawler\Queue\Store\ArrayStore;
use WP\Crawler\Queue\Validator\ValidFileExtension;
use WP\Crawler\Queue\Validator\NoPseudoUrl;
use WP\Crawler\Event\LogSubscriber;
use WP\Crawler\Event\BrokenLinkFinderSubscriber;
use Symfony\Component\EventDispatcher\EventDispatcher;

// Command-line entry point.
//   $argv[1]  domain to crawl (required)
//   $argv[2]  wait time handed to the crawler (optional)
if (!isset($argv[1])) {
    // No domain supplied: print a short usage hint instead of crawling.
    echo "\n", "Usage " . $argv[0] . ' {domain} {time to wait}' . "\n";
} else {
    $domain = $argv[1];

    // Queue manager backed by the array queue/store, with both URL
    // validators attached.
    $manager = new QueueManager(
        new ArrayQueue(),
        new ArrayStore()
    );
    $manager
        ->addValidator(new NoPseudoUrl())
        ->addValidator(new ValidFileExtension());

    $crawler = new DomainCrawler($manager, new LinkFinder());

    // The wait time is only set when a second argument was passed.
    if (isset($argv[2])) {
        $crawler->setWaitTime($argv[2]);
    }

    // Register the logging and broken-link subscribers on the crawler's
    // event dispatcher before the crawl starts.
    $dispatcher = $crawler->getEventDispatcher();
    $dispatcher->addSubscriber(new LogSubscriber());
    $dispatcher->addSubscriber(new BrokenLinkFinderSubscriber());

    $crawler->crawl($domain);
}