Home > Backend Development > PHP Tutorial > share 一段小说的采集代码

share 一段小说的采集代码

WBOY
WBOYOriginal
2016-06-13 13:09:54 · 816 browse

share 一段小说的采集代码。
最近用火车头、ET 采集小说,按它们的规则经常配不出来。碰到像小说520 这种在页面里用 iframe 的站点,就直接挂掉,只好自己写了一个。刚开始觉得就是两个正则解决[列表,内容]的事,写着写着便复杂起来了。
好好地改了几版,碰到的最大麻烦是如何封装代码,也就是让采集不同站点时的改动成本较小。这里小小地用了一个决策者模式,然后把该封装的功能封装起来。另外还加了再次采集时对已采集章节的跳过机制,避免重采[毕竟一个小说站有好几万篇文章,中断一次接不上去,是很郁闷的事]。

PHP code
<!--

Code highlighting produced by Actipro CodeHighlighter (freeware)
http://www.CodeHighlighter.com/

-->

/**
 * Controller that scrapes novel chapter listings from remote sites and records
 * them through the story / dstorycate / dstory models.
 *
 * The visible code targets http://www.xiaoxiaoshuo.net/ (index(),
 * add_storyxxs_info()) and http://www.xiaoshuo520.com/ (index445(),
 * add_story520_info()). Pages are fetched, converted from GBK to UTF-8 and
 * parsed with regular expressions; categories already flagged with
 * dstorycate_ishot == 1 are skipped so an interrupted crawl can resume.
 *
 * NOTE(review): this listing appears to have lost text during HTML extraction.
 * Several regex patterns and comparisons are truncated (for example the
 * $where condition in index() and the "if($key ..." guards), most method
 * bodies are collapsed onto single lines, and the class's closing brace is
 * missing. Restore the code from the original source before running it.
 *
 * NOTE(review): check_dstory(), log() and CS_file_get_contents() are called
 * but not defined in this listing; presumably they come from the base
 * Controller or a helper file -- confirm.
 */
class grep extends Controller {
    // Listing/configuration values; presumably consumed by the base
    // Controller (not visible here) -- TODO confirm.
    var $tableName = 'grep';
    var $pagesize =31;
    var $order_string = "grep_order desc,grep_id desc";
    var $filter_field = "grep_title";
    var $check_repeat_field = "grep_title";
     var $buttons = array(
     );
     // Human-readable label; the string means "[crawl novels]".
     var $description = "[爬取小说]";
    


/**
 * Crawl the xiaoxiaoshuo.net index page of each story returned by story_model.
 *
 * For each story the page is downloaded, converted from GBK to UTF-8, stripped
 * of inline style/title attributes, and split into categories; each category's
 * chapter URLs and titles are passed to add_storyxxs_info().
 *
 * NOTE(review): the query condition and the per-story skip guard below are
 * truncated in this listing -- confirm against the original source.
 */
function index()
{
    //get the story list    
    $story_model = "story_model";
    $this->load->model($story_model);
    $where = array("story_id $story_model->get($where);
        
    foreach ($rows_story as $key=>$val_story):
    if($key storycate_vtitle."/".$val_story->story_vtitle;
        $src_content = file_get_contents($url);
        $src_content = iconv("GBK","utf-8//IGNORE",$src_content);
                
            $src_content = str_replace("/style=\"border-width:0px\s*1px\s*1px\s*0px;border-color:#C8D8B8;border-style:solid;padding:3px;float:left;width:313px;\"/i","",$src_content);
            $src_content = str_replace("style=\"BORDER-RIGHT: #c8d8b8 1px solid; PADDING-RIGHT: 3px; BORDER-TOP: #c8d8b8 0px solid; PADDING-LEFT: 3px; FLOAT: left; PADDING-BOTTOM: 3px; BORDER-LEFT: #c8d8b8 0px solid; WIDTH: 313px; PADDING-TOP: 3px; BORDER-BOTTOM: #c8d8b8 1px solid\"","",$src_content);
            
              $src_content = preg_replace("/title=\"[^\"]*\"/iU","",$src_content);
              $src_content = preg_replace("/title=\"[^\"]*\"/iU","",$src_content);
            $src_content = preg_replace("/
  • ]*>/iU","",$src_content); $src_content = preg_replace("/]*>/iU","",$src_content); $src_content = preg_replace("//iU",$src_content,$arr_dstorycate); $dstorycate_arr = $arr_dstorycate[1]; foreach ($dstorycate_arr as $key_dstorycate => $val_dstory_cate) { preg_match_all("/([^/i",$val_dstory_cate,$dcate_title); $datacate["dstorycate_pid"] = $val_story->story_id; $datacate["dstorycate_title"] = $dcate_title[1][0]; //获取类别对象,记将之前的类别标置为已下载 $dtitle =$datacate["dstorycate_title"]; $obj_storycate = $this->check_dcate($dtitle,$val_story ); //pr($obj_storycate); if($obj_storycate->dstorycate_ishot == 1) { $this->log( "已此章节已抓取完 $val_story->story_title - $dtitle ,跳过"); continue; } preg_match_all("/]*>(?!)([\d\D]*)/iU",$val_dstory_cate,$dinfo_list); $list_story_url = $dinfo_list[1]; $list_story_title = $dinfo_list[2]; $story_url = "http://www.xiaoxiaoshuo.net/".$val_story->storycate_vtitle."/".$val_story->story_vtitle; $this->add_storyxxs_info($obj_storycate,$list_story_url,$list_story_title,$story_url); } endforeach; } function status() { $sql = "select count(dstory_id) as all_story from dstory;"; $query = $this->db->query($sql); $cont_all = $query->row(); echo $cont_all->all_story; $sql = "select count(dstory_id) as story1 from dstory where dstory_status = 1"; $query = $this->db->query($sql); $cont_all = $query->row(); echo "--".$cont_all->story1; $sql = "select max(dstorycate_id) as max_id,max(dstorycate_pid) as max_pid from dstorycate"; $query = $this->db->query($sql); $cont_all = $query->row(); echo "--".$cont_all->max_id."--".$cont_all->max_pid; } /** * *下载445后的章节 * */ function index445() { $story_model = "story_model"; $this->load->model($story_model); $where = array("story_id > 445"); $rows_story = $this->$story_model->get($where); foreach ($rows_story as $key=>$val_story): //get the story_content $story_url_arr = explode("/",$val_story->story_url); $story_url= $story_url_arr[1]."/".$story_url_arr[2]."/".$story_url_arr[3]."/".$story_url_arr[4]; $dest_url = 
"http://www.xiaoshuo520.com/".$story_url; $src_content = CS_file_get_contents($dest_url); $src_content = iconv("GBK","utf-8//IGNORE",$src_content); //按类别进行分类分组数据. preg_match_all("/(
    [\d\D]*)
    $val_dstory_cate) { preg_match_all("/
    ([\d\D]*)/i",$val_dstory_cate,$dcate_title); $datacate["dstorycate_pid"] = $val_story->story_id; $datacate["dstorycate_title"] = $dcate_title[1][0]; //获取类别对象,记将之前的类别标置为已下载 $dtitle =$datacate["dstorycate_title"]; $obj_storycate = $this->check_dcate($dtitle,$val_story ); //pr($obj_storycate); if($obj_storycate->dstorycate_ishot == 1) { $this->log( "已此章节已抓取完 $val_story->story_title - $dtitle ,跳过"); continue; } preg_match_all("/]*>(?!)([\d\D]*)/iU",$val_dstory_cate,$dinfo_list); $list_story_url = $dinfo_list[1]; $list_story_title = $dinfo_list[2]; $this->add_story520_info($obj_storycate,$list_story_url,$list_story_title,$story_url); } endforeach; } /** * 根据章节标准 和小说对象,获取标题对象 * */ function check_dcate($title,$obj_story) { $dstorycate_model = "dstorycate_model"; $this->load->model($dstorycate_model); $where = array("dstorycate_pid = $obj_story->story_id","dstorycate_title = '$title'"); $rows = $this->$dstorycate_model->get($where); if(!$rows) { $datacate["dstorycate_pid"] = $obj_story->story_id; $datacate["dstorycate_title"] = $title; $this->$dstorycate_model->insert($datacate); $obj_cate_id = $this->db->insert_id(); $where = array("dstorycate_id = $obj_cate_id","dstorycate_title = '$title'"); $rows = $this->$dstorycate_model->get($where); $this->log( "此书没有相关类别,将进行添加 小说$obj_story->story_title - $title "); }else { $this->log( "已存在相关小说类别 $obj_story->story_title - $title ,跳过"); } $obj_cate = $rows[0]; $sql = "update dstorycate set dstorycate_published = 1 where dstorycate_pid = $obj_story->story_id && dstorycate_id dstorycate_id "; $this->db->query($sql); return $obj_cate; } /*** *添加小说对旬 */ function add_story520_info($cate_obj,$list_story_url,$list_story_title,$url) { $dstory_model = "dstory_model"; $this->load->model($dstory_model); $min_key = intval($cate_obj->dstorycate_pvcount); if(!$min_key) $min_key = 0 ; foreach($list_story_url as $key=>$val): if($key check_dstory($cate_obj,"http://www.xiaoshuo520.com/".$url."/".$val,$list_story_title[$key],"grep_520_info"); 
endforeach; } function add_storyxxs_info($cate_obj,$list_story_url,$list_story_title,$url) { $dstory_model = "dstory_model"; $this->load->model($dstory_model); $min_key = intval($cate_obj->dstorycate_pvcount); if(!$min_key) $min_key = 0 ; foreach($list_story_url as $key=>$val): if($key log("$cate_obj->dstorycate_id 号 $cate_obj->dstorycate_title ".$list_story_title[$key]." 章 $key check_dstory($cate_obj,$url."/".$val,$list_story_title[$key],"grep_xxs_info"); endforeach; }
  • Statement:
    The content of this article is voluntarily contributed by netizens, and the copyright belongs to the original author. This site does not assume corresponding legal responsibility. If you find any content suspected of plagiarism or infringement, please contact admin@php.cn