首頁 >php教程 >php手册 >CURL+DOM采集小样

CURL+DOM采集小样

WBOY
WBOY原創
2016-06-07 11:39:44 857瀏覽

个人感觉效率非常高的采集方式,但DOMDocument好像无法采集到带标签的内容,采集纯文本非常快.
大部分功能都没来得及做就被否了,只能提供大家参考用.
可以看到CURL代理访问+ZEND_DOM采集用法.
主要看CollectGoodsController.class.php就可以了.

    /**
     * Collect a shop's category links and store them in the goods_category table.
     *
     * The page is currently read from the local sample file Offline/shops.htm and
     * the shop/mall ids are hard-coded to 1; $shop_url and $continue are accepted
     * but not used yet.
     *
     * @param string $shop_url Shop URL (currently unused).
     * @param int    $continue Continuation flag (currently unused).
     * @return bool true when categories were collected; false when the shop already
     *              has categories, the page could not be read, or nothing matched.
     */
    public function getShopCate($shop_url="",$continue=0){
        $source=file_get_contents('Offline/shops.htm');
        if($source===false){
            return false;// sample page could not be read
        }
        $shops_id=1;
        $mall_id=1;
        $cate=M('goods_category')->where(array('shops_id'=>$shops_id))->find();
        if(!empty($cate)){
            // Categories for this shop were already collected.
            // (The original returned the undefined constant `fasle`, which is truthy on PHP < 8.)
            return false;
            //$this->error('采集店铺栏目已存在');
        }

        import('@.Tao.TaoHttp','','.php');
        $Http= new \TaoHttp();
        // Normalise the page to UTF-8 before handing it to the DOM query.
        $shop_html=$Http->encoding($source);

        $shop_category_rule=D('CollectGoods')->getRule($mall_id,'shop_category');
        import('@.Tao.Dom.Query','','.php');
        $Dom= new \Zend_Dom_Query($shop_html);
        $shop_category=$Dom->query($shop_category_rule);
        if(count($shop_category)==0){
            return false;// no shop categories matched the rule
            //$this->error('采集不到店铺栏目');
        }
        $result=array();
        foreach ($shop_category as $key => $value) {
            $result[$key]['url']=$value->getAttribute('href');
            $result[$key]['name']=trim($value->nodeValue);
        }
        // The first match is discarded; presumably it is not a real category
        // link (e.g. an "all items" entry) - TODO confirm against the rule.
        unset($result[0]);

        $data=array();
        $time=time();
        foreach ($result as $value) {
            $data[]=array(
                'shops_id'=> $shops_id,
                'cate_name'=> $value['name'],
                'cate_url'=> $value['url'],
                'collect_time'=>$time,
            );
        }
        if(empty($data)){
            return false;// nothing left to insert after dropping the first match
        }
        M('goods_category')->addAll($data);
        return true;// shop categories collected
    }

    /**
     * Collect the goods listed under one shop category and store them in the goods table.
     *
     * Example: http://localhost/TaoGoods/index.php?m=Taogoods&c=CollectGoods&a=getShopGoods&cate_id=3
     *
     * Reports the outcome through $this->error()/$this->success(); returns false
     * only when $cate_id is 0 or the category row does not exist.
     *
     * @param int $cate_id Primary key of the goods_category row to collect.
     */
    public function getShopGoods($cate_id=0){
        if($cate_id==0){return false;}
        $goods_time=M('goods')->where(array('cate_id'=>$cate_id))->getField('collect_time');

        if($goods_time){
            // $this->day is defined outside this excerpt; the message implies 15 days.
            if($goods_time + 86400*$this->day > time()){
                $this->error('15天内请勿重复采集',U('index'));
            }
            $this->error('采集店铺栏目下货品已存在',U('index'));
        }

        $cate_data=M('goods_category')->find($cate_id);
        if(empty($cate_data)){
            return false;// unknown category id: nothing to collect
        }
        $shops_id=$cate_data['shops_id'];
        $cate_id=$cate_data['id'];
        $mall_id=$cate_data['mall_id'];

        import('@.Tao.TaoHttp','','.php');
        $Http= new \TaoHttp();
        $source=$Http->get($cate_data['cate_url']);
        $shop_html=$Http->encoding($source);

        $cate_rule=D('CollectGoods')->getRule($mall_id);
        import('@.Tao.Dom.Query','','.php');
        $Dom= new \Zend_Dom_Query($shop_html);
        $cate_imgs=$Dom->query($cate_rule['shop_category_goods_img']);
        $cate_names=$Dom->query($cate_rule['shop_category_goods_name']);
        $cate_sales=$Dom->query($cate_rule['shop_category_goods_sale']);
        $cate_cprices=$Dom->query($cate_rule['shop_category_goods_cprice']);
        //$cate_sprices=$Dom->query($cate_rule['shop_category_goods_sprice']);

        // The name matches drive the loop; the other result sets are read at the same index.
        $num=count($cate_names);

        $time=time();
        $result=array();
        for ($i=0; $i<$num; $i++) {
            $result[$i]['goods_thumb']=$cate_imgs->bykey($i)->getAttribute('src');
            $result[$i]['goods_name']=$cate_names->bykey($i)->nodeValue;
            $result[$i]['goods_url']=$cate_names->bykey($i)->getAttribute('href');
            $result[$i]['goods_cprice']=$cate_cprices->bykey($i)->nodeValue;
            $result[$i]['goods_sale']=$cate_sales->bykey($i)->nodeValue;
            //$result[$i]['goods_spirce']=$cate_sprices->bykey($i)->nodeValue;
            $result[$i]['mall_id']=$mall_id;
            $result[$i]['shops_id']=$shops_id;
            $result[$i]['cate_id']=$cate_id;
            $result[$i]['collect_time']=$time;
        }

        if(M('goods')->addAll($result)){
            $this->success('采集店铺栏目下货品成功',U('index'));
        }
    }

    /**
     * Inspect the page data and convert GBK content to UTF-8.
     *
     * When the detected encoding is CP936 (GBK) the source is converted with
     * iconv and a charset <meta> tag is prepended; any other detection result
     * is returned untouched.
     *
     * @param string $source Raw page content.
     * @return string The page content, converted to UTF-8 when it was GBK.
     */
    public function encoding($source){
        $encode = mb_detect_encoding($source, array("GBK","UTF-8","GB2312","BIG5"));
        if($encode=='CP936'){
            $converted=iconv("GBK", "UTF-8//IGNORE", $source);
            if($converted===false){
                return $source;// conversion failed: hand back the original bytes
            }
            // The meta tag lets the DOM parser pick up the encoding. The published
            // snippet showed a bare '<meta>' (attributes apparently stripped), which
            // carries no charset information.
            $meta = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>';
            $source=$meta.$converted;
        }
        return $source;
    }

测试:只有这两个按钮能用,其他的都不能用
CURL+DOM采集小样
CURL+DOM采集小样
测试的话 可以将goods表清空 点击采集货品
sql文件在压缩包里

BY:悠悠山雨

附件 Taogoods.zip ( 2.2 MB 下载:72 次 )

AD:真正免费,域名+虚机+企业邮箱=0元

陳述:
本文內容由網友自願投稿,版權歸原作者所有。本站不承擔相應的法律責任。如發現涉嫌抄襲或侵權的內容,請聯絡admin@php.cn