个人感觉效率非常高的采集方式,但DOMdocument好像无法采集到带标签的内容,采集纯文本非常快.
大部分功能都没来得及做就被否了,只能提供大家参考用.
可以看到CURL代理访问+ZEND_DOM采集用法.
主要看CollectGoodsController.class.php就可以了.
//获取店铺栏目<br>
/**
 * Collect a shop's category links and store them in the goods_category table.
 *
 * The page source is currently read from the local snapshot
 * 'Offline/shops.htm'; $shop_url and $continue are accepted but not used yet.
 * The shop id and mall id are hard-coded to 1.
 *
 * @param string $shop_url Shop URL (currently unused).
 * @param int    $continue Continuation flag (currently unused).
 * @return bool True when the categories were inserted; false when categories
 *              for this shop already exist, the page could not be read, no
 *              category link was matched, or the insert failed.
 */
public function getShopCate($shop_url="",$continue=0){
    $source = file_get_contents('Offline/shops.htm');
    if ($source === false) {
        return false; // snapshot could not be read
    }
    $shops_id = 1;
    $mall_id = 1;

    $cate = M('goods_category')->where(array('shops_id' => $shops_id))->find();
    if (!empty($cate)) {
        return false; // categories for this shop were already collected
        //$this->error('采集店铺栏目已存在');
    }

    import('@.Tao.TaoHttp','','.php');
    $Http = new \TaoHttp();
    $shop_html = $Http->encoding($source);

    $shop_category_rule = D('CollectGoods')->getRule($mall_id,'shop_category');
    import('@.Tao.Dom.Query','','.php');
    $Dom = new \Zend_Dom_Query($shop_html);
    $shop_category = $Dom->query($shop_category_rule);
    if (count($shop_category) == 0) {
        return false; // no category link matched the rule
        //$this->error('采集不到店铺栏目');
    }

    $time = time();
    $data = array();
    foreach ($shop_category as $key => $node) {
        $data[$key] = array(
            'shops_id'     => $shops_id,
            'cate_name'    => trim($node->nodeValue),
            'cate_url'     => $node->getAttribute('href'),
            'collect_time' => $time,
        );
    }
    // The first matched link is always discarded.
    // NOTE(review): presumably it is not a real category (e.g. an "all goods" link) - confirm.
    unset($data[0]);
    // Re-index from 0 so the batch is a plain list again.
    $data = array_values($data);

    if (empty($data)) {
        return false; // nothing left to store after dropping the first link
    }

    // Report the real outcome of the batch insert instead of always returning true.
    return M('goods_category')->addAll($data) !== false;
}
/**
 * Collect the goods listed under one shop category and store them in the goods table.
 *
 * Example:
 * http://localhost/TaoGoods/index.php?m=Taogoods&c=CollectGoods&a=getShopGoods&cate_id=3
 *
 * @param int $cate_id Primary key of the goods_category row to collect.
 * @return bool|null False when $cate_id is 0, the category row does not exist,
 *                   or the insert fails; otherwise the result is reported
 *                   through $this->success() / $this->error().
 */
public function getShopGoods($cate_id=0){
    if ($cate_id == 0) { return false; }
    $goods_time = M('goods')->where(array('cate_id' => $cate_id))->getField('collect_time');

    if ($goods_time) {
        // Goods for this category were collected before: refuse to collect again.
        if ($goods_time + 86400*$this->day > time()) {
            $this->error('15天内请勿重复采集',U('index'));
        }
        $this->error('采集店铺栏目下货品已存在',U('index'));
    }

    $cate_data = M('goods_category')->find($cate_id);
    if (empty($cate_data)) {
        return false; // unknown category id: there is no URL to fetch
    }
    $shops_id = $cate_data['shops_id'];
    $cate_id = $cate_data['id'];
    $mall_id = $cate_data['mall_id'];

    import('@.Tao.TaoHttp','','.php');
    $Http = new \TaoHttp();
    $source = $Http->get($cate_data['cate_url']);
    $shop_html = $Http->encoding($source);

    $cate_rule = D('CollectGoods')->getRule($mall_id);
    import('@.Tao.Dom.Query','','.php');
    $Dom = new \Zend_Dom_Query($shop_html);
    $cate_imgs = $Dom->query($cate_rule['shop_category_goods_img']);
    $cate_names = $Dom->query($cate_rule['shop_category_goods_name']);
    $cate_sales = $Dom->query($cate_rule['shop_category_goods_sale']);
    $cate_cprices = $Dom->query($cate_rule['shop_category_goods_cprice']);
    //$cate_sprices=$Dom->query($cate_rule['shop_category_goods_sprice']);

    // The name nodes drive the loop; the other result sets are read at the same index.
    $num = count($cate_names);

    $time = time();
    $result = array();
    for ($i = 0; $i < $num; $i++) {
        $name_node = $cate_names->bykey($i);
        $result[$i]['goods_thumb'] = $cate_imgs->bykey($i)->getAttribute('src');
        $result[$i]['goods_name'] = $name_node->nodeValue;
        $result[$i]['goods_url'] = $name_node->getAttribute('href');
        $result[$i]['goods_cprice'] = $cate_cprices->bykey($i)->nodeValue;
        $result[$i]['goods_sale'] = $cate_sales->bykey($i)->nodeValue;
        //$result[$i]['goods_spirce']=$cate_sprices->bykey($i)->nodeValue;
        $result[$i]['mall_id'] = $mall_id;
        $result[$i]['shops_id'] = $shops_id;
        $result[$i]['cate_id'] = $cate_id;
        $result[$i]['collect_time'] = $time;
    }

    if (M('goods')->addAll($result)) {
        $this->success('采集店铺栏目下货品成功',U('index'));
    }
    return false;
}
/**
 * Inspect fetched page data and convert GBK/GB2312 content to UTF-8.
 *
 * Input that is already valid UTF-8 is returned untouched. Converted input
 * gets a UTF-8 charset <meta> tag prepended so that a DOM parser reading the
 * result picks the right encoding.
 *
 * @param string $source Raw page content.
 * @return string The content as UTF-8 when it was detected as GBK/GB2312;
 *                otherwise the input unchanged.
 */
public function encoding($source){
    // Check UTF-8 first: many UTF-8 multibyte sequences are also valid GBK
    // byte pairs, so detecting with GBK ahead of UTF-8 could re-convert (and
    // corrupt) content that needs no conversion at all.
    if (mb_check_encoding($source, 'UTF-8')) {
        return $source;
    }

    $encode = mb_detect_encoding($source, array("GBK","UTF-8","GB2312","BIG5"));
    // mbstring reports GBK under its canonical name "CP936" and GB2312 as
    // "EUC-CN". GB2312 is a subset of GBK, so both go through the GBK converter.
    if ($encode == 'CP936' || $encode == 'EUC-CN') {
        $source = iconv("GBK", "UTF-8//IGNORE", $source);
        // The meta tag lets the DOM parser determine the encoding. A bare
        // "<meta>" carries no charset, so the full declaration is spelled out.
        $meta = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>';
        $source = $meta.$source;
    }
    return $source;
}
测试:只有这两个按钮能用,其他的都不能用
测试的话 可以将goods表清空 点击采集货品
sql文件在压缩包里
BY:悠悠山雨
Taogoods.zip ( 2.2 MB 下载:72 次 )
AD:真正免费,域名+虚机+企业邮箱=0元