Rumah >pembangunan bahagian belakang >tutorial php >Goutte怎么获取a标签里面的url,或者好用的PHP爬虫库,谢谢

Goutte怎么获取a标签里面的url,或者好用的PHP爬虫库,谢谢

WBOY
WBOYasal
2016-08-08 09:06:351319semak imbas

Goutte怎么获取a标签里面的url?或者好用的PHP爬虫库,谢谢

<code><?php require('./Vendor/autoload.php');
use Goutte\Client;

/**
* 
*/
class Spider 
{
    private $_client;
    private $_crawler;
    public  $_news = [
        'title'   => [],
        'link'    => [],
        'content' => [],
        'source'  => [],
        'date'    => [],
    ];

    public function __construct()
    {
        try {
            $this->_client  = new Client();
            $this->_crawler = $this->_client->request('GET', 'http://www.ningshan.gov.cn/Category_90/Index.aspx');
            // $client->getClient()->setDefaultOption('config/curl/'.CURLOPT_TIMEOUT, 10);
        } catch (Exception $e) {
            throw new \Exception($e->getMessage(), 1);
        }
    }

    public function getDate()
    {
        $this->_crawler->filter('div#list>ul>li>span')->each(function ($node) {
           $this->_news['date'][] = $node->text();
        });
    }

    public function getTitle()
    {
        $link = $this->_crawler->selectLink('宁陕县召开政协八届二十二次次常委会')->link();
        var_dump($link->getUri);die;
        $this->_crawler->filter('div#list>ul>li>a')->each(function ($node) {
           if ($node->text() !== '宁陕要闻') {
                $this->_news['title'][]  = $node->text();
                $this->_news['link'][]   = $node->link();
                $this->_news['source'][] = '宁陕要闻';
           }
        });
    }
}

//-----------------------------------
try {
    $spider = new Spider();
    $spider->getDate();
    $spider->getTitle();

    echo json_encode($spider->_news, JSON_UNESCAPED_UNICODE);
} catch (Exception $e) {
    echo $e->getMessage();
}

</code>

回复内容:

Goutte怎么获取a标签里面的url?或者好用的PHP爬虫库,谢谢

<code><?php require('./Vendor/autoload.php');
use Goutte\Client;

/**
* 
*/
class Spider 
{
    private $_client;
    private $_crawler;
    public  $_news = [
        'title'   => [],
        'link'    => [],
        'content' => [],
        'source'  => [],
        'date'    => [],
    ];

    public function __construct()
    {
        try {
            $this->_client  = new Client();
            $this->_crawler = $this->_client->request('GET', 'http://www.ningshan.gov.cn/Category_90/Index.aspx');
            // $client->getClient()->setDefaultOption('config/curl/'.CURLOPT_TIMEOUT, 10);
        } catch (Exception $e) {
            throw new \Exception($e->getMessage(), 1);
        }
    }

    public function getDate()
    {
        $this->_crawler->filter('div#list>ul>li>span')->each(function ($node) {
           $this->_news['date'][] = $node->text();
        });
    }

    public function getTitle()
    {
        $link = $this->_crawler->selectLink('宁陕县召开政协八届二十二次次常委会')->link();
        var_dump($link->getUri);die;
        $this->_crawler->filter('div#list>ul>li>a')->each(function ($node) {
           if ($node->text() !== '宁陕要闻') {
                $this->_news['title'][]  = $node->text();
                $this->_news['link'][]   = $node->link();
                $this->_news['source'][] = '宁陕要闻';
           }
        });
    }
}

//-----------------------------------
try {
    $spider = new Spider();
    $spider->getDate();
    $spider->getTitle();

    echo json_encode($spider->_news, JSON_UNESCAPED_UNICODE);
} catch (Exception $e) {
    echo $e->getMessage();
}

</code>

现找的

<code>$crawler = $client->request('GET', 'http://www.symfony.com/blog/');
$link = $crawler->selectLink('Security Advisories')->link();
print_r($link->getUri());</code>

手册:http://symfony.com/doc/curren...
GIT:https://github.com/FriendsOfP...

采集类参考:http://flc.ren/2016/06/528.html

Kenyataan:
Kandungan artikel ini disumbangkan secara sukarela oleh netizen, dan hak cipta adalah milik pengarang asal. Laman web ini tidak memikul tanggungjawab undang-undang yang sepadan. Jika anda menemui sebarang kandungan yang disyaki plagiarisme atau pelanggaran, sila hubungi admin@php.cn