首頁  >  文章  >  後端開發  >  Goutte怎麼取得a標籤裡面的url,或好用的PHP爬蟲庫,謝謝

Goutte怎麼取得a標籤裡面的url,或好用的PHP爬蟲庫,謝謝

WBOY
WBOY原創
2016-08-08 09:06:35 · 1262 次瀏覽

Goutte怎麼取得a標籤裡面的url?或好用的PHP爬蟲庫,謝謝

<code><?php
require('./Vendor/autoload.php');
use Goutte\Client;

/**
 * Scrapes a news list page with Goutte and collects the results in $_news.
 *
 * Typical use: construct the object (this performs the HTTP request), call
 * getDate() and getTitle(), then read the public $_news array.
 */
class Spider 
{
    /** @var Client Goutte client used to perform the request. */
    private $_client;

    /** @var \Symfony\Component\DomCrawler\Crawler Crawler returned for the fetched page. */
    private $_crawler;

    /**
     * Collected data, one list per field.
     *
     * 'date' is filled by getDate(); 'title', 'link' and 'source' are filled
     * by getTitle(). Nothing in this class fills 'content'.
     *
     * @var array
     */
    public  $_news = [
        'title'   => [],
        'link'    => [],
        'content' => [],
        'source'  => [],
        'date'    => [],
    ];

    /**
     * Fetches the list page so the getters can query it.
     *
     * @param string $url Page to crawl; defaults to the news list this spider was written for.
     *
     * @throws \Exception When the client cannot be created or the request fails.
     */
    public function __construct($url = 'http://www.ningshan.gov.cn/Category_90/Index.aspx')
    {
        try {
            $this->_client  = new Client();
            // NOTE: no request timeout is configured here; an earlier attempt
            // to set CURLOPT_TIMEOUT through the client was left disabled.
            $this->_crawler = $this->_client->request('GET', $url);
        } catch (Exception $e) {
            // Keep the original exception attached so the root cause is not lost.
            throw new \Exception($e->getMessage(), 1, $e);
        }
    }

    /**
     * Appends the text of every "div#list>ul>li>span" node to $_news['date'].
     */
    public function getDate()
    {
        $this->_crawler->filter('div#list>ul>li>span')->each(function ($node) {
           $this->_news['date'][] = $node->text();
        });
    }

    /**
     * Collects title, URL and source for every "div#list>ul>li>a" anchor,
     * skipping anchors whose text is the category label itself.
     *
     * The URL is read with link()->getUri(): link() returns a Link object and
     * getUri() is a method, so it needs the call parentheses. Storing the Link
     * object itself would not produce a URL in the JSON output.
     */
    public function getTitle()
    {
        $sourceLabel = '宁陕要闻';

        $this->_crawler->filter('div#list>ul>li>a')->each(function ($node) use ($sourceLabel) {
            // The category anchor carries the label as its text; it is not a news item.
            if ($node->text() === $sourceLabel) {
                return;
            }

            $this->_news['title'][]  = $node->text();
            $this->_news['link'][]   = $node->link()->getUri();
            $this->_news['source'][] = $sourceLabel;
        });
    }
}

//-----------------------------------
// Run the scrape end to end and print the collected news as JSON.
// Any exception raised along the way is reduced to its message on output.
try {
    $scraper = new Spider();

    $scraper->getDate();
    $scraper->getTitle();

    // JSON_UNESCAPED_UNICODE keeps the Chinese text readable instead of \uXXXX escapes.
    $payload = json_encode($scraper->_news, JSON_UNESCAPED_UNICODE);
    echo $payload;
} catch (Exception $error) {
    echo $error->getMessage();
}

</code>

回覆內容:

Goutte怎麼取得a標籤裡面的url?或好用的PHP爬蟲庫,謝謝

<code><?php
require('./Vendor/autoload.php');
use Goutte\Client;

/**
 * Scrapes a news list page with Goutte and collects the results in $_news.
 *
 * Typical use: construct the object (this performs the HTTP request), call
 * getDate() and getTitle(), then read the public $_news array.
 */
class Spider 
{
    /** @var Client Goutte client used to perform the request. */
    private $_client;

    /** @var \Symfony\Component\DomCrawler\Crawler Crawler returned for the fetched page. */
    private $_crawler;

    /**
     * Collected data, one list per field.
     *
     * 'date' is filled by getDate(); 'title', 'link' and 'source' are filled
     * by getTitle(). Nothing in this class fills 'content'.
     *
     * @var array
     */
    public  $_news = [
        'title'   => [],
        'link'    => [],
        'content' => [],
        'source'  => [],
        'date'    => [],
    ];

    /**
     * Fetches the list page so the getters can query it.
     *
     * @param string $url Page to crawl; defaults to the news list this spider was written for.
     *
     * @throws \Exception When the client cannot be created or the request fails.
     */
    public function __construct($url = 'http://www.ningshan.gov.cn/Category_90/Index.aspx')
    {
        try {
            $this->_client  = new Client();
            // NOTE: no request timeout is configured here; an earlier attempt
            // to set CURLOPT_TIMEOUT through the client was left disabled.
            $this->_crawler = $this->_client->request('GET', $url);
        } catch (Exception $e) {
            // Keep the original exception attached so the root cause is not lost.
            throw new \Exception($e->getMessage(), 1, $e);
        }
    }

    /**
     * Appends the text of every "div#list>ul>li>span" node to $_news['date'].
     */
    public function getDate()
    {
        $this->_crawler->filter('div#list>ul>li>span')->each(function ($node) {
           $this->_news['date'][] = $node->text();
        });
    }

    /**
     * Collects title, URL and source for every "div#list>ul>li>a" anchor,
     * skipping anchors whose text is the category label itself.
     *
     * The URL is read with link()->getUri(): link() returns a Link object and
     * getUri() is a method, so it needs the call parentheses. Storing the Link
     * object itself would not produce a URL in the JSON output.
     */
    public function getTitle()
    {
        $sourceLabel = '宁陕要闻';

        $this->_crawler->filter('div#list>ul>li>a')->each(function ($node) use ($sourceLabel) {
            // The category anchor carries the label as its text; it is not a news item.
            if ($node->text() === $sourceLabel) {
                return;
            }

            $this->_news['title'][]  = $node->text();
            $this->_news['link'][]   = $node->link()->getUri();
            $this->_news['source'][] = $sourceLabel;
        });
    }
}

//-----------------------------------
// Run the scrape end to end and print the collected news as JSON.
// Any exception raised along the way is reduced to its message on output.
try {
    $scraper = new Spider();

    $scraper->getDate();
    $scraper->getTitle();

    // JSON_UNESCAPED_UNICODE keeps the Chinese text readable instead of \uXXXX escapes.
    $payload = json_encode($scraper->_news, JSON_UNESCAPED_UNICODE);
    echo $payload;
} catch (Exception $error) {
    echo $error->getMessage();
}

</code>

現找的

<code>$crawler = $client->request('GET', 'http://www.symfony.com/blog/');
// NOTE: $client is not defined in this snippet; presumably a Goutte\Client created beforehand.
// selectLink() picks the anchor by its visible text; link() wraps it in a Link object.
$link = $crawler->selectLink('Security Advisories')->link();
// getUri() is a method, so it must be called with parentheses to obtain the link's URL.
print_r($link->getUri());</code>

手冊:http://symfony.com/doc/current/components/dom_crawler.html
GIT:https://github.com/FriendsOfPHP/Goutte

採集類參考:http://flc.ren/2016/06/528.html

陳述:
本文內容由網友自願投稿,版權歸原作者所有。本站不承擔相應的法律責任。如發現涉嫌抄襲或侵權的內容,請聯絡admin@php.cn