coderden/page-parser

A powerful PHP package for parsing HTML pages with DOM, XPath, and CSS selector support

Installs: 1

Dependents: 0

Suggesters: 0

Security: 0

Stars: 0

Watchers: 0

Forks: 0

Open Issues: 0

pkg:composer/coderden/page-parser

1.0.0 2026-01-15 19:29 UTC

This package is auto-updated.

Last update: 2026-01-15 19:53:00 UTC


README

PHP Version License

A powerful PHP package for parsing HTML pages with DOM, XPath, and CSS selector support. Perfect for web scraping, data extraction, and automation tasks.

Features

  • 🔍 XPath and CSS selector support for precise element targeting
  • 🌐 HTTP client integration with Guzzle
  • 🔗 Automatic URL resolution for relative links and images
  • 📦 Multiple extraction methods for text, attributes, and HTML
  • 🔄 Meta tag parsing and SEO data extraction
  • 🚀 Easy-to-use fluent interface
  • 📊 Response handling with status codes and headers
  • 🛡️ Error handling and exception management

Installation

composer require coderden/page-parser

Quick Start

use CoderDen\PageParser\PageParser;

// Create parser instance
$parser = new PageParser();

// Load and parse a page
$parser->loadPage('https://example.com');

// Get page title
echo $parser->getTitle();

// Extract all links
$links = $parser->getAllLinks();

// Extract specific elements
$products = $parser->extractByXPath('//div[@class="product"]', [
    'name' => './/h3/text()',
    'price' => './/span[@class="price"]/text()',
    'url' => './/a/@href',
]);

Basic Usage

Using PageParser Directly

$parser = new PageParser([
    'timeout' => 30,
    'headers' => [
        'User-Agent' => 'MyBot/1.0',
    ],
]);

// Load page
$parser->loadPage('https://example.com');

// Extract by XPath
$data = $parser->extractByXPath('//article', [
    'title' => './/h2/text()',
    'content' => './/p/text()',
]);

// Extract by CSS selector
$links = $parser->extractByCss('a.article-link', ['href', '_text']);

// Check element existence
if ($parser->exists('.pagination')) {
    echo 'Pagination found!';
}

// Get element count
$imageCount = $parser->count('img');

Using ParserHelper

use CoderDen\PageParser\ParserHelper;

// Quick extraction
$links = ParserHelper::extractLinks('https://example.com');

// Get page title
$title = ParserHelper::getTitle('https://example.com');

// Extract specific data
$products = ParserHelper::extract(
    'https://example.com/products',
    '//div[@class="product-item"]',
    ['name' => './/h3/text()', 'price' => './/span[@class="price"]/text()']
);

// Check URL availability
if (ParserHelper::checkUrl('https://example.com')) {
    echo 'URL is accessible';
}

Advanced Features

Meta Data Extraction

$parser = new PageParser();
$parser->loadPage('https://example.com');

// Get meta tags
$metaTags = $parser->getMetaTags();

// Get canonical URL
$canonical = $parser->getCanonicalUrl();

// Get page charset
$charset = $parser->getCharset();

// Get Open Graph data
$ogTitle = $parser->getAttribute('meta[property="og:title"]', 'content');
$ogImage = $parser->getAttribute('meta[property="og:image"]', 'content');

URL Resolution

$parser = new PageParser();
$parser->loadPage('https://example.com/blog');

// All links are automatically resolved to absolute URLs
$links = $parser->extractLinksByXPath('//a[@href]');

// Images with relative paths become absolute
$images = $parser->extractImagesByXPath('//img[@src]');

Regular Expression Search

$parser = new PageParser();
$parser->loadPage('https://example.com');

// Search for email addresses
$emails = $parser->searchByRegex('/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/');

// Search for phone numbers
$phones = $parser->searchByRegex('/\+?[\d\s\-\(\)]{7,}/');

Configuration Options

$parser = new PageParser([
    // HTTP client options
    'timeout' => 30,
    'connect_timeout' => 10,
    'verify' => true, // SSL verification
    'allow_redirects' => true,
    
    // Custom headers
    'headers' => [
        'User-Agent' => 'MyCrawler/1.0',
        'Accept' => 'text/html,application/xhtml+xml',
        'Accept-Language' => 'en-US,en;q=0.9',
        'Referer' => 'https://google.com',
    ],
    
    // Proxy support
    'proxy' => 'http://proxy.example.com:8080',
    
    // Authentication
    'auth' => ['username', 'password'],
    
    // Cookies
    'cookies' => true,
]);

Error Handling

use CoderDen\PageParser\PageParser;

try {
    $parser = new PageParser();
    $parser->loadPage('https://example.com');
    
    // Your parsing logic here
    
} catch (\RuntimeException $e) {
    echo "Failed to load page: " . $e->getMessage();
    
} catch (\Exception $e) {
    echo "General error: " . $e->getMessage();
}

Examples

Example 1: Scrape Product List

$parser = new PageParser();
$parser->loadPage('https://example.com/products');

$products = $parser->extractByXPath('//div[contains(@class, "product")]', [
    'name' => './/h3/text()',
    'price' => './/span[@class="price"]/text()',
    'sku' => './/span[@class="sku"]/text()',
    'image' => './/img/@src',
    'url' => './/a/@href',
]);

foreach ($products as $product) {
    echo "Product: {$product['name']}\n";
    echo "Price: {$product['price']}\n";
    echo "Image: {$product['image']}\n";
    echo "---\n";
}

Example 2: Extract Article Data

$articleData = ParserHelper::extract(
    'https://example.com/article',
    '//article',
    [
        'title' => './/h1/text()',
        'author' => './/span[@class="author"]/text()',
        'date' => './/time/@datetime',
        'content' => './/div[@class="content"]//p//text()',
        'tags' => './/a[@rel="tag"]//text()',
    ]
);

// Process article content. $content is always defined (an empty string when
// nothing was extracted), so later code never reads an undefined variable.
// The extracted value may be a single string or a list of text nodes.
$rawContent = $articleData[0]['content'] ?? '';
$content = is_array($rawContent)
    ? implode("\n", $rawContent)
    : $rawContent;

Example 3: Batch Processing URLs

$urls = [
    'https://example.com/page1',
    'https://example.com/page2',
    'https://example.com/page3',
];

$allData = [];
foreach ($urls as $url) {
    try {
        $data = ParserHelper::extract($url, '//h1', ['_text']);
        $allData[$url] = $data[0] ?? 'No title';
    } catch (\Exception $e) {
        $allData[$url] = "Error: " . $e->getMessage();
    }
}

// Save results. JSON_THROW_ON_ERROR makes json_encode() raise \JsonException
// on failure instead of returning false, which file_put_contents() would
// otherwise write out as an empty file without any error.
file_put_contents(
    'results.json',
    json_encode($allData, JSON_PRETTY_PRINT | JSON_THROW_ON_ERROR)
);

Example 4: Monitor Website Changes

/**
 * Captures a point-in-time snapshot of a page.
 *
 * The monitor itself does not compare anything: store the returned snapshots
 * and diff two of them to detect a change.
 */
class WebsiteMonitor
{
    private PageParser $parser;

    /**
     * @param PageParser|null $parser Parser used to load pages. When omitted, a
     *                                PageParser with default options is created.
     *                                Pass a configured instance to control
     *                                timeouts, headers, proxy, etc.
     */
    public function __construct(?PageParser $parser = null)
    {
        $this->parser = $parser ?? new PageParser();
    }

    /**
     * Loads the page and returns a snapshot of its current state.
     *
     * @param string $url             Page to load.
     * @param string $elementSelector Selector of the elements to watch, passed
     *                                unchanged to PageParser::count() and
     *                                PageParser::exists() (e.g. '.news-item').
     *
     * @return array Snapshot with the keys 'title', 'element_count',
     *               'element_exists', 'status_code' and 'timestamp'.
     *
     * NOTE(review): loadPage() presumably throws \RuntimeException when the page
     * cannot be loaded (see the Error Handling section); it is not caught here,
     * so callers should handle it.
     */
    public function checkForChanges(string $url, string $elementSelector): array
    {
        $this->parser->loadPage($url);

        return [
            'title' => $this->parser->getTitle(),
            'element_count' => $this->parser->count($elementSelector),
            'element_exists' => $this->parser->exists($elementSelector),
            'status_code' => $this->parser->getStatusCode(),
            // date() formats in PHP's configured default timezone.
            'timestamp' => date('Y-m-d H:i:s'),
        ];
    }
}

$monitor = new WebsiteMonitor();
$changes = $monitor->checkForChanges('https://example.com', '.news-item');