Heim >Backend-Entwicklung >PHP-Tutorial >自己写的一个php基于phpQuery的通用采集类

自己写的一个php基于phpQuery的通用采集类

WBOY · Original: 2016-07-25 08:50:15 · 1221 Durchsuche

还是小菜，第一次分享代码哈，这是自己以前写的一个php的采集类，自己一直在用，自我感觉很简单很强大，只要懂一点点选择器的知识就可以采集任何页面了，也支持https页面，做简单的采集足够用了。

/**
*通用列表采集类
*版本V1.3
*作者:JAE
*博客:http://blog.jaekj.com
*/
require_once '../phpQuery/phpQuery/phpQuery.php';
/**
 * Generic page scraper built on phpQuery.
 *
 * The page is downloaded once in the constructor; extraction rules can then be
 * applied (and re-applied through setQuery()) against the cached HTML.
 *
 * Rule array format: array("name" => array("selector", "type"), ...)
 *   - "type" is "text", "html", or the name of an attribute to read.
 * Block selector: first selects the repeating container blocks; every rule is
 * then evaluated inside each block, producing one row per block.
 */
class QueryList
{
    /** @var string URL of the page that was requested. */
    private $pageURL;

    /** @var array Extraction rules, name => array(selector, type). */
    private $regArr = array();

    /** @var array Rows produced by the most recent extraction. */
    public $jsonArr = array();

    /** @var string Block selector; empty means "no block scoping". */
    private $regRange;

    /** @var string Downloaded page body ('' when the download failed). */
    private $html;

    /**
     * Download the page and, when rules are given, run the extraction.
     *
     * Declared as __construct() because a method named after the class is no
     * longer treated as a constructor on PHP 8.
     *
     * @param string $pageURL  Address of the page to scrape.
     * @param array  $regArr   Extraction rules (see class comment); may be empty.
     * @param string $regRange Optional block selector.
     */
    public function __construct($pageURL, $regArr = array(), $regRange = '')
    {
        $this->pageURL = $pageURL;

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $this->pageURL);
        // NOTE(review): certificate and host verification are switched off so
        // that any https page can be fetched. This allows man-in-the-middle
        // tampering; enable verification if the scraped data must be trusted.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        $body = curl_exec($ch);
        curl_close($ch);

        // curl_exec() returns false on failure; normalise to an empty document
        // so later extraction yields no rows instead of parsing a boolean.
        $this->html = ($body === false) ? '' : $body;

        if (!empty($regArr)) {
            $this->regArr = $regArr;
            $this->regRange = $regRange;
            $this->getList();
        }
    }

    /**
     * Run a new set of rules against the already downloaded page.
     *
     * Any previous result in $jsonArr is discarded.
     *
     * @param array  $regArr   Extraction rules (see class comment).
     * @param string $regRange Optional block selector.
     */
    public function setQuery($regArr, $regRange = '')
    {
        $this->jsonArr = array();
        $this->regArr = $regArr;
        $this->regRange = $regRange;
        $this->getList();
    }

    /**
     * Apply the current rules to the cached HTML and fill $jsonArr.
     */
    private function getList()
    {
        // Nothing to parse or nothing to look for: leave the result untouched.
        if ($this->html === '' || empty($this->regArr)) {
            return;
        }

        $hobj = phpQuery::newDocumentHTML($this->html);

        if (!empty($this->regRange)) {
            // Block mode: one row per matched block, every rule scoped to it.
            $i = 0;
            foreach (pq($hobj)->find($this->regRange) as $item) {
                foreach ($this->regArr as $key => $reg_value) {
                    $iobj = pq($item)->find($reg_value[0]);
                    $this->jsonArr[$i][$key] = $this->extractValue($iobj, $reg_value[1]);
                }
                $i++;
            }
            return;
        }

        // Flat mode: each rule is matched against the whole document and its
        // n-th match is stored in row n, so rows only line up across rules
        // when every selector matches the same number of elements.
        foreach ($this->regArr as $key => $reg_value) {
            $i = 0;
            foreach (pq($hobj)->find($reg_value[0]) as $item) {
                $this->jsonArr[$i++][$key] = $this->extractValue($item, $reg_value[1]);
            }
        }
    }

    /**
     * Read one value from a node according to the rule type.
     *
     * @param mixed  $node Element or phpQuery selection to read from.
     * @param string $type "text", "html", or an attribute name.
     * @return mixed Trimmed text/HTML, or the raw attribute value.
     */
    private function extractValue($node, $type)
    {
        switch ($type) {
            case 'text':
                return trim(pq($node)->text());
            case 'html':
                return trim(pq($node)->html());
            default:
                return pq($node)->attr($type);
        }
    }

    /**
     * @return string The current result rows encoded as JSON.
     */
    public function getJSON()
    {
        return json_encode($this->jsonArr);
    }
}

复制代码

require 'Query/QueryList.class.php';
// Scrape the OSChina code-sharing list: title, link and author of each entry.
$listUrl = "http://www.oschina.net/code/list";
$listRules = array(
    "title"  => array(".code_title a:eq(0)", "text"),
    "url"    => array(".code_title a:eq(0)", "href"),
    "author" => array("img", "title"),
);
$listRange = ".code_list li";
$collector = new QueryList($listUrl, $listRules, $listRange);
print_r($collector->jsonArr);

// To also collect the "TOP40 active contributors" avatars on the right-hand
// side of the same page and get them as JSON, re-query the loaded page.
$avatarRules = array("portrait" => array(".hot_top img", "src"));
$collector->setQuery($avatarRules);
echo $collector->getJSON() . "
";

// Scrape the content of an OSChina detail page.
$detailUrl = "http://www.oschina.net/code/snippet_186288_23816";
$detailRules = array(
    "title" => array(".QTitle h1", "text"),
    "con"   => array(".Content", "html"),
);
$collector = new QueryList($detailUrl, $detailRules);
print_r($collector->jsonArr);
// That is enough examples to show how convenient scraping becomes.
//就举这么多例子吧，是不是用来做采集很方便

复制代码

/**
*自己写的百度和谷歌搜索API
*版本V2.0
*作者:JAE
*博客:http://blog.jaekj.com
**/
require_once 'QueryList_class.php';
/**
 * Small search "API" that scrapes Baidu or Google result pages through
 * QueryList and exposes the results as an array / JSON.
 */
class Searcher
{
    /** @var string Engine name as passed in ('baidu' or 'google'). */
    private $searcher;

    /** @var string Search keyword. */
    private $key;

    /** @var int Number of results requested per page. */
    private $num;

    /** @var int Zero-based page index (the constructor receives it one-based). */
    private $page;

    /** @var array QueryList rules for one result entry. */
    private $regArr;

    /** @var string QueryList block selector for one result entry. */
    private $regRange;

    /** @var array QueryList rule for the "total results" text. */
    private $regZnum;

    /** @var array Final result structure (see getList()). */
    public $jsonArr;

    /**
     * Configure the selectors for the chosen engine and run the search.
     *
     * Declared as __construct() because a method named after the class is no
     * longer treated as a constructor on PHP 8.
     *
     * @param string $searcher Search engine: 'baidu' or 'google'.
     * @param string $key      Search keyword.
     * @param int    $num      Number of results per page.
     * @param int    $page     One-based page number.
     */
    public function __construct($searcher, $key, $num, $page)
    {
        if ($searcher === 'baidu') {
            $this->regArr = array("title"=>array("h3.t a,#ting_singlesong_box a","text"),"tCon"=>array("div.c-abstract,font:slice(0,2),div#weibo,table tr:eq(0),div.c-abstract-size p:eq(0),div.vd_sitcom_new_tinfo","text"),"url"=>array("h3.t a,#ting_singlesong_box a","href"));
            $this->regRange = 'table.result,table.result-op';
            $this->regZnum = array("zNum"=>array("span.nums","text"));
        } else if ($searcher === 'google') {
            $this->regArr = array("title"=>array("h3.r a","text"),"tCon"=>array("span.st","text"),"url"=>array("h3.r a","href"));
            $this->regRange = 'li.g';
            $this->regZnum = array("zNum"=>array("div#resultStats","text"));
        }
        $this->searcher = $searcher;
        $this->key = $key;
        $this->num = $num;
        $this->page = $page - 1;
        $this->getList();
    }

    /**
     * Fetch the result page, resolve redirect URLs and build $jsonArr.
     *
     * $jsonArr ends up as:
     *   num, page, zNum (total hits), zPage (total pages), s (keyword),
     *   other (author info), data (list of title/tCon/url rows).
     */
    private function getList()
    {
        $s = urlencode($this->key);
        $num = $this->num;
        $start = $this->num * $this->page;

        if ($this->searcher === 'baidu') {
            $url = "http://www.baidu.com/s?pn=$start&rn=$num&wd=$s";
            $reg_znum = '/[\d,]+/';
        } else if ($this->searcher === 'google') {
            $url = "https://www.google.com.hk/search?filter=0&lr=&newwindow=1&safe=images&hl=en&as_qdr=all&num=$num&start=$start&q=$s";
            $reg_znum = '/([\d,]+) result(s)?/';
        } else {
            // Unsupported engine: there is no URL or selector set to use, so
            // report an empty result instead of requesting an undefined URL.
            $this->jsonArr = $this->buildResult(0, 0.0, array());
            return;
        }

        $searcherObj = new QueryList($url, $this->regArr, $this->regRange);

        // Replace each engine redirect link with the real target address.
        $rows = $searcherObj->jsonArr;
        foreach ($rows as $i => $row) {
            if (empty($row['url'])) {
                continue; // no link was extracted for this entry
            }
            if ($this->searcher === 'baidu') {
                $rows[$i]['url'] = $this->getBaiduRealURL($row['url']);
            } else {
                $rows[$i]['url'] = $this->getGoogleRealURL($row['url']);
            }
        }

        // Total number of hits, taken from the engine's statistics line.
        $searcherObj->setQuery($this->regZnum);
        $zText = isset($searcherObj->jsonArr[0]['zNum']) ? (string) $searcherObj->jsonArr[0]['zNum'] : '';
        $zNum = preg_match($reg_znum, $zText, $arr) ? $arr[0] : 0;
        // The (int) cast also drops any trailing " results" text in the match.
        $zNum = (int) str_replace(',', '', $zNum);

        // Total number of pages; guard against a page size of zero.
        $zPage = ($this->num > 0) ? ceil($zNum / $this->num) : 0.0;

        $this->jsonArr = $this->buildResult($zNum, $zPage, $rows);
    }

    /**
     * Assemble the public result structure.
     *
     * @param int   $zNum  Total number of hits.
     * @param float $zPage Total number of pages.
     * @param array $data  Result rows.
     * @return array
     */
    private function buildResult($zNum, $zPage, $data)
    {
        return array('num'=>$this->num,'page'=>((int)$this->page+1),'zNum'=>$zNum,'zPage'=>$zPage,"s"=>"$this->key",'other'=>array('author'=>'JAE','QQ'=>'734708094','blog'=>'http://blog.jaekj.com'),'data'=>$data);
    }

    /**
     * @return string The result structure encoded as JSON.
     */
    public function getJSON()
    {
        return json_encode($this->jsonArr);
    }

    /**
     * Resolve a Baidu redirect link to its real target.
     *
     * @param string $url Link taken from the Baidu result page.
     * @return string The first Location header when the first response status
     *                is 301/302; otherwise the link unchanged.
     */
    private function getBaiduRealURL($url)
    {
        $header = get_headers($url, 1);
        // get_headers() returns false when the request fails.
        if ($header === false || !isset($header[0])) {
            return $url;
        }

        $isRedirect = strpos($header[0], '301') !== false || strpos($header[0], '302') !== false;
        if (!$isRedirect || !isset($header['Location'])) {
            return $url;
        }

        // With several hops the header is an array; keep the first hop.
        if (is_array($header['Location'])) {
            return $header['Location'][0];
        }
        return $header['Location'];
    }

    /**
     * Extract the real target from a Google redirect link.
     *
     * @param string $url Link taken from the Google result page.
     * @return string URL-decoded value found between "q=" and the next "&",
     *                or the link unchanged when that pattern is absent.
     */
    private function getGoogleRealURL($url)
    {
        $reg_url = '/q=(.+)&/U';
        return preg_match($reg_url, $url, $arr) ? urldecode($arr[1]) : $url;
    }
}
// $hj = new Searcher('google','oschina',20,2);
// print_r( $hj->jsonArr);
//效果演示地址
//http://blog.jaekj.com//jae/demo/searcher/Searcher_class.php?searcher=baidu&s=jaekj&num=20&page=1

复制代码

Stellungnahme：

Der Inhalt dieses Artikels wird freiwillig von Internetnutzern beigesteuert und das Urheberrecht liegt beim ursprünglichen Autor. Diese Website übernimmt keine entsprechende rechtliche Verantwortung. Wenn Sie Inhalte finden, bei denen der Verdacht eines Plagiats oder einer Rechtsverletzung besteht, wenden Sie sich bitte an admin@php.cn

Vorheriger Artikel：后台隔5分钟发送email，email内容为html Nächster Artikel：虚拟主机MySQL数据库备份

In Verbindung stehende Artikel

Mehr sehen