ixnode / php-web-crawler
PHP Web Crawler - This PHP class allows you to recursively crawl a given HTML page (or a given HTML file) and collect some data from it.
0.1.24
2024-02-28 01:18 UTC
Requires
- php: ^8.2
- ext-curl: *
- ext-dom: *
- ext-gd: *
- ext-libxml: *
- adhocore/cli: ^v1.0.0
- ixnode/php-container: ^0.1.22
Requires (Dev)
- friendsofphp/php-cs-fixer: ^3.13
- ixnode/bash-version-manager: ^0.1.3
- jetbrains/phpstorm-attributes: ^1.0
- phpmd/phpmd: ^2.13
- phpstan/phpstan: ^1.9
- phpunit/phpunit: ^9.5
- povils/phpmnd: ^3.0
- rector/rector: ^0.15.1
README
This PHP class allows you to recursively crawl a given HTML page (or a given HTML file) and collect some data from it. Simply define the URL (or an HTML file) and a set of XPath expressions that should be mapped to the output data object. The final representation will be a PHP array, which can easily be converted into JSON format for further processing.
1. Installation
composer require ixnode/php-web-crawler
vendor/bin/php-web-crawler -V
php-web-crawler 0.1.0 (02-24-2024 14:46:26) - Björn Hempel <bjoern@hempel.li>
2. Usage
2.1 PHP Code
use Ixnode\PhpWebCrawler\Output\Field; use Ixnode\PhpWebCrawler\Source\Raw; use Ixnode\PhpWebCrawler\Value\Text; use Ixnode\PhpWebCrawler\Value\XpathTextNode; $rawHtml = <<<HTML <html> <head> <title>Test Page</title> </head> <body> <h1>Test Title</h1> <p>Test Paragraph</p> </body> </html> HTML; $html = new Raw( $rawHtml, new Field('version', new Text('1.0.0')), new Field('title', new XpathTextNode('//h1')), new Field('paragraph', new XpathTextNode('//p')) ); $html->parse()->getJsonStringFormatted(); // See below
2.2 JSON result
{ "version": "1.0.0", "title": "Test Title", "paragraph": "Test Paragraph" }
3. Advanced usage
3.1 Group
PHP Code
use Ixnode\PhpWebCrawler\Output\Field; use Ixnode\PhpWebCrawler\Output\Group; use Ixnode\PhpWebCrawler\Source\Raw; use Ixnode\PhpWebCrawler\Value\XpathTextNode; $rawHtml = <<<HTML <html> <head> <title>Test Page</title> </head> <body> <h1>Test Title</h1> <p class="paragraph-1">Test Paragraph 1</p> <p class="paragraph-2">Test Paragraph 2</p> </body> </html> HTML; $html = new Raw( $rawHtml, new Field('title', new XpathTextNode('/html/head/title')), new Group( 'content', new Group( 'header', new Field('h1', new XpathTextNode('/html/body//h1')), ), new Group( 'text', new Field('p1', new XpathTextNode('/html/body//p[@class="paragraph-1"]')), new Field('p2', new XpathTextNode('/html/body//p[@class="paragraph-2"]')), ) ) ); $html->parse()->getJsonStringFormatted(); // See below
JSON result
{ "title": "Test Page", "content": { "header": { "h1": "Test Title" }, "text": { "p1": "Test Paragraph 1", "p2": "Test Paragraph 2" } } }
3.2 XpathSection
PHP Code
use Ixnode\PhpWebCrawler\Output\Field; use Ixnode\PhpWebCrawler\Output\Group; use Ixnode\PhpWebCrawler\Source\Raw; use Ixnode\PhpWebCrawler\Source\XpathSection; use Ixnode\PhpWebCrawler\Value\XpathTextNode; $rawHtml = <<<HTML <html> <head> <title>Test Page</title> </head> <body> <div class="content"> <h1>Test Title</h1> <p class="paragraph-1">Test Paragraph 1</p> <p class="paragraph-2">Test Paragraph 2</p> </div> </body> </html> HTML; $html = new Raw( $rawHtml, new Field('title', new XpathTextNode('/html/head/title')), new Group( 'content', new XpathSection( '/html/body//div[@class="content"]', new Group( 'header', new Field('h1', new XpathTextNode('./h1')), ), new Group( 'text', new Field('p1', new XpathTextNode('./p[@class="paragraph-1"]')), new Field('p2', new XpathTextNode('./p[@class="paragraph-2"]')), ) ) ) ); $html->parse()->getJsonStringFormatted(); // See below
JSON result
{ "title": "Test Page", "content": { "header": { "h1": "Test Title" }, "text": { "p1": "Test Paragraph 1", "p2": "Test Paragraph 2" } } }
3.3 XpathSections (flat)
PHP Code
use Ixnode\PhpWebCrawler\Output\Field; use Ixnode\PhpWebCrawler\Output\Group; use Ixnode\PhpWebCrawler\Source\Raw; use Ixnode\PhpWebCrawler\Source\XpathSections; use Ixnode\PhpWebCrawler\Value\XpathTextNode; $rawHtml = <<<HTML <html> <head> <title>Test Page</title> </head> <body> <div class="content"> <h1>Test Title</h1> <p class="paragraph-1">Test Paragraph 1</p> <p class="paragraph-2">Test Paragraph 2</p> <ul> <li>Test Item 1</li> <li>Test Item 2</li> </ul> </div> </body> </html> HTML; $html = new Raw( $rawHtml, new Field('title', new XpathTextNode('/html/head/title')), new Group( 'hits', new XpathSections( '/html/body//div[@class="content"]/ul', new XpathTextNode('./li/text()'), ) ) ); $html->parse()->getJsonStringFormatted(); // See below
JSON result
{ "title": "Test Page", "hits": [ [ "Test Item 1", "Test Item 2" ] ] }
3.4 XpathSections (structured)
PHP Code
use Ixnode\PhpWebCrawler\Output\Field; use Ixnode\PhpWebCrawler\Output\Group; use Ixnode\PhpWebCrawler\Source\Raw; use Ixnode\PhpWebCrawler\Source\XpathSections; use Ixnode\PhpWebCrawler\Value\XpathTextNode; $rawHtml = <<<HTML <html> <head> <title>Test Page</title> </head> <body> <div class="content"> <h1>Test Title</h1> <p class="paragraph-1">Test Paragraph 1</p> <p class="paragraph-2">Test Paragraph 2</p> <table> <tbody> <tr> <th>Caption 1</th> <td>Cell 1</td> </tr> <tr> <th>Caption 2</th> <td>Cell 2</td> </tr> </tbody> </table> </div> </body> </html> HTML; $html = new Raw( $rawHtml, new Field('title', new XpathTextNode('/html/head/title')), new Group( 'hits', new XpathSections( '/html/body//div[@class="content"]/table/tbody/tr', new Field('caption', new XpathTextNode('./th/text()')), new Field('content', new XpathTextNode('./td/text()')), ) ) ); $html->parse()->getJsonStringFormatted(); // See below
JSON result
{ "title": "Test Page", "hits": [ { "caption": "Caption 1", "content": "Cell 1" }, { "caption": "Caption 2", "content": "Cell 2" } ] }
4. More examples
- examples/converter.php
- examples/group.php
- examples/section.php
- examples/sections-recursive-url.php
- examples/sections.php
- examples/simple-wiki-page.php
5. Development
git clone git@github.com:ixnode/php-web-crawler.git && cd php-web-crawler
composer install
composer test
6. License
This library is licensed under the MIT License - see the LICENSE.md file for details.