arefshojaei/spider

PHP web spider

v2.3.0 2025-03-28 14:43 UTC

This package is auto-updated.

Last update: 2025-03-28 14:47:03 UTC


README

PHP web spider

<?php

use Spider\Spider;

$spider = new Spider;

$page = $spider->loadHTML("http://google.com");

echo $page->find("title")->text() . PHP_EOL;

$page->findAll("a")->each(function($key, $link) {
    echo "[LINK] " . $link->attr("href") . PHP_EOL;
});

Installation

Using Composer

composer create-project arefshojaei/spider

Using GIT

git clone https://github.com/ArefShojaei/Spider

Find element

  • find()
  • findAll()
$page->find("a");

$page->findAll(".product");

Iterate for each eleemnt

  • each()
  • map()
  • filter()
$page->findAll("a")->each(function($key, $anchor) {
    echo "[LINK] " . $anchor->attr("href") . PHP_EOL;
    echo "[TITLE] " . $anchor->text() . PHP_EOL;
    echo "[HTML] " . $anchor->html() . PHP_EOL;
});

# ----------------------------------------
$anchors = $page->findAll("a")->map(function($key, $anchor) {
    $anchor->attr("data-id", rand());

    return $anchor;
});

var_dump($anchors);

# ----------------------------------------
$filteredAnchors = $page->findAll("a")->filter(function($key, $anchor) => $anchor->attr("data-id")); 

var_dump($filteredAnchors);

Element traversing

  • parent()
  • after()
  • before()
  • append()
  • prepend()
$parentNode = $page->find(".product")->parent();

# Add parent Element
$page->find(".product")->after("<p>After Element</p>");
$page->find(".product")->before("<p>Before Element</p>");

# Add child (local) element
$page->find(".product")->append("<p>Append Element</p>");
$page->find(".product")->prepend("<p>Prepend Element</p>");

Element cleaner

  • empty()
  • remove()
# Clean element content
$page->find("p")->empty();

# Remove element from the DOM
$page->find("p")->remove();

Element content

  • text()
  • html()
# Getter
$text = $page->find("p")->text();
$html = $page->find("p")->html();

# Setter
$newText = $page->find("p")->text("New text content");
$newHtml = $page->find("p")->html("<p id='spider'>New html content</p>");

Element attribute

  • attr()
  • addClass()
  • removeClass()
  • hasClass()
  • addId()
  • removeId()
  • hasId()
# Getter
$attributes = $page->find("a")->attr();

$link = $page->find("a")->attr("href");

# Setter
$page->find("a")->attr("data-id", rand());

# Class
$page->find("p")->addClass("spider");
$page->find("p")->removeClass("spider");
$page->find("p")->hasClass("spider");

# ID
$page->find("p")->addID("spider");
$page->find("p")->removeID("spider");
$page->find("p")->hasID("spider");

Export current page content

$filename = "app";

$path = __DIR__ . "\\html\\" . $filename . rand() . ".html";

$page->export($path);