Rumah >php教程 >PHP源码 >PHP采集www.php.cn的文章,并存入数据库。

PHP采集www.php.cn的文章,并存入数据库。

PHP中文网
PHP中文网asal
2016-05-25 17:01:44 · 1643 semak imbas

[PHP]代码  

<?php

/**
 * Crawls 36kr article pages over a range of numeric article ids, extracts the
 * title and body text of each page, and optionally stores them through PDO.
 *
 * Every id in [start, end] is requested as http://www.36kr.com/p/<id>.html.
 * Pages that do not answer with HTTP 200, or in which the title/content
 * markers are not found, are skipped.
 */
class Fork36kr
{
    /** @var int First article id to request (inclusive). */
    private $start;
    /** @var int Last article id to request (inclusive). */
    private $end;
    /** @var int Number of pages from which both title and content were extracted. */
    private $number = 0;
    /** @var string PDO data source name; empty string disables persistence. */
    private $dsn;
    /** @var string Database user name. */
    private $user;
    /** @var string Database password. */
    private $password;
    /** @var PDO|null Connection used by save(); null when no DSN was given. */
    private $pdo;
    /** @var PDOStatement|null INSERT statement, prepared on first use and then reused. */
    private $insert;

    /**
     * @param int    $start    Article id at which the crawl starts
     * @param int    $end      Article id at which the crawl ends
     * @param string $dsn      PDO data source name (empty: do not store anything)
     * @param string $user     Database user name
     * @param string $password Database password
     *
     * @throws PDOException When a DSN is given and the connection cannot be opened.
     */
    public function __construct($start = 200100, $end = 206670, $dsn = '', $user = '', $password = '')
    {
        $this->start = $start;
        $this->end = $end;
        $this->dsn = $dsn;
        $this->user = $user;
        $this->password = $password;
        if ($dsn) {
            $this->pdo = new PDO($this->dsn, $this->user, $this->password);
        }
    }

    /**
     * Runs the crawl, printing one "url,title" line per extracted article and
     * a summary with the total count at the end.
     *
     * @return void
     */
    public function fork()
    {
        // One handle is reused for all requests; only the URL changes per iteration.
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);

        echo "<<<<< OK. Start Fork 36kr >>>>>\n";
        for ($i = $this->start; $i <= $this->end; $i++) {
            $url = "http://www.36kr.com/p/".$i.".html";
            curl_setopt($ch, CURLOPT_URL, $url);
            $page = curl_exec($ch);

            // curl_exec() yields false on a transport failure; such pages and
            // anything other than HTTP 200 are skipped.
            if ($page === false || curl_getinfo($ch, CURLINFO_HTTP_CODE) !== 200) {
                continue;
            }

            $t = preg_match('#<h1 class="entry-title sep10">.*</h1>#', $page, $title);
            $c = preg_match('#<p class="mainContent sep-10">.*</p>#Us', $page, $content);
            if (!$t || !$c) {
                // Without both matches $title/$content are still preg_match
                // result arrays, not text, so there is nothing valid to store.
                continue;
            }

            $title = strip_tags($title[0]);
            $content = strip_tags($content[0]);
            echo $url.','.$title."\n";
            $this->number++;

            if ($this->pdo !== null) {
                $this->save($title, $content, $url);
            }
        }
        echo '<<<< Fork Over! Total: '.$this->number.'  >>>>';
    }

    /**
     * Inserts one article row into the `36kr` table.
     *
     * @param string $title   Article title with markup stripped
     * @param string $content Article body with markup stripped
     * @param string $url     URL the article was fetched from
     *
     * @return void
     */
    private function save($title, $content, $url)
    {
        // Prepare once and reuse instead of re-preparing for every article.
        if ($this->insert === null) {
            $sql = "INSERT INTO `36kr` (`id`,`title`,`content`,`url`) VALUES (null,:title,:content,:url)";
            $this->insert = $this->pdo->prepare($sql);
        }
        $this->insert->execute([
            ':title' => $title,
            ':content' => $content,
            ':url' => $url,
        ]);
    }
}

// Connection settings for the local `test` database. The charset is set on
// the DSN so the connection encoding matches the utf8 `36kr` table; without
// it the server default applies and non-ASCII article text may be stored
// garbled.
$dsn = 'mysql:host=localhost;dbname=test;charset=utf8';
$user = 'root';
$password= 'root';

// Crawl article ids 200100..206670 and store every extracted article.
$kr = new Fork36kr(200100,206670,$dsn,$user,$password);
$kr->fork();

[文件]  phpcn.sql                              

-- phpMyAdmin SQL Dump
-- version 4.0.5
-- http://www.php.cn/
--
-- Host: localhost
-- Generated: 2013-10-03 00:36 (date separators were mis-encoded in the original dump)
-- Server version: 5.6.12-log
-- PHP version: 5.5.3

-- Under NO_AUTO_VALUE_ON_ZERO only NULL (not 0) in an AUTO_INCREMENT column
-- produces the next sequence value.
SET SQL_MODE = "NO_AUTO_VALUE_ON_ZERO";
SET time_zone = "+00:00";


-- Remember the session's character set / collation settings so they can be
-- restored at the end of the dump, then switch the connection to utf8.
/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
/*!40101 SET NAMES utf8 */;

--
-- Database: `test`
--

-- --------------------------------------------------------

--
-- Table structure for `36kr`
--

-- One row per scraped article: auto-generated id, title, body text and the
-- URL the article was fetched from.
CREATE TABLE IF NOT EXISTS `36kr` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `title` varchar(128) NOT NULL,
  `content` text NOT NULL,
  `url` varchar(128) NOT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 AUTO_INCREMENT=1 ;

-- Restore the character set / collation settings saved above.
/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;

                               

                   

Kenyataan:
Kandungan artikel ini disumbangkan secara sukarela oleh netizen, dan hak cipta adalah milik pengarang asal. Laman web ini tidak memikul tanggungjawab undang-undang yang sepadan. Jika anda menemui sebarang kandungan yang disyaki plagiarisme atau pelanggaran, sila hubungi admin@php.cn