Home  >  Article  >  php教程  >  PHP采集www.php.cn的文章,并存入数据库。

PHP采集www.php.cn的文章,并存入数据库。

PHP中文网
PHP中文网 · Original
2016-05-25 17:01:44 · 1600 views

[PHP]代码  

<?php

/**
 * Crawls a contiguous range of 36kr.com article ids and, when a PDO data
 * source is configured, stores each extracted article in the `36kr` table.
 *
 * Usage: construct with an id range (and optionally a DSN plus credentials),
 * then call fork(). Progress is written to standard output.
 */
class Fork36kr
{
    /** @var int First article id to fetch (inclusive). */
    private $start;
    /** @var int Last article id to fetch (inclusive). */
    private $end;
    /** @var int Number of articles successfully extracted so far. */
    private $number = 0;
    /** @var string PDO data source name; an empty value disables persistence. */
    private $dsn;
    /** @var string Database user name. */
    private $user;
    /** @var string Database password. */
    private $password;
    /** @var PDO|null Open connection, or null when no DSN was supplied. */
    private $pdo;

    /**
     * @param int    $start    first article id to crawl
     * @param int    $end      last article id to crawl
     * @param string $dsn      PDO data source name; leave empty to only print results
     * @param string $user     database user name
     * @param string $password database password
     *
     * @throws PDOException when a DSN is given and the connection cannot be opened
     */
    public function __construct($start=200100, $end=206670, $dsn='', $user='', $password='')
    {
        $this->start = $start;
        $this->end = $end;
        $this->dsn = $dsn;
        $this->user = $user;
        $this->password = $password;
        if ($dsn) {
            $this->pdo = new PDO($this->dsn, $this->user, $this->password);
            // Make failures explicit instead of depending on the PHP
            // version's default error mode; fork() catches and logs them.
            $this->pdo->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
        }
    }

    /**
     * Fetches every article id in [start, end], prints "url,title" for each
     * page from which both a title and a body could be extracted, and saves
     * those articles when a database connection is available.
     *
     * Pages that fail to download, do not answer with HTTP 200, or do not
     * contain the expected markup are skipped without being saved.
     *
     * @return void
     */
    public function fork()
    {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);

        echo "<<<<< OK. Start Fork 36kr >>>>>\n";
        for ($id = $this->start; $id <= $this->end; $id++) {
            $url = "http://www.36kr.com/p/".$id.".html";
            curl_setopt($ch, CURLOPT_URL, $url);
            $page = curl_exec($ch);

            // curl_exec() yields false on transport errors; only a complete
            // 200 response is worth parsing.
            if ($page === false || (int) curl_getinfo($ch, CURLINFO_HTTP_CODE) !== 200) {
                continue;
            }

            $article = $this->extract($page);
            if ($article === null) {
                // Never persist a page we could not parse: the previous code
                // called save() here with the empty match arrays.
                continue;
            }
            list($title, $content) = $article;

            echo $url.','.$title."\n";
            $this->number++;

            if ($this->pdo !== null) {
                try {
                    $this->save($title, $content, $url);
                } catch (PDOException $e) {
                    // Best effort: report the failed insert and keep crawling.
                    error_log('Fork36kr: could not save '.$url.': '.$e->getMessage());
                }
            }
        }
        echo '<<<< Fork Over! Total: '.$this->number.'  >>>>';
    }

    /**
     * Pulls the title and body text out of an article page.
     *
     * @param string $page raw HTML of the article page
     *
     * @return array|null array($title, $content) with tags stripped, or null
     *                    when either element is missing from the page
     */
    private function extract($page)
    {
        // Ungreedy so the match stops at the first closing tag.
        $hasTitle = preg_match('#<h1 class="entry-title sep10">.*</h1>#U', $page, $titleMatch);
        $hasContent = preg_match('#<p class="mainContent sep-10">.*</p>#Us', $page, $contentMatch);
        if (!$hasTitle || !$hasContent) {
            return null;
        }

        // To keep <p> and <a> in the body, pass '<p><a>' as the second
        // argument of strip_tags() for the content.
        return array(strip_tags($titleMatch[0]), strip_tags($contentMatch[0]));
    }

    /**
     * Inserts one article into the `36kr` table using a prepared statement.
     *
     * @param string $title   article title (plain text)
     * @param string $content article body (plain text)
     * @param string $url     URL the article was fetched from
     *
     * @return void
     *
     * @throws PDOException when the statement cannot be prepared or executed
     */
    private function save($title, $content, $url)
    {
        $sql = "INSERT INTO `36kr` (`id`,`title`,`content`,`url`) VALUES (null,:title,:content,:url)";
        $stmt = $this->pdo->prepare($sql);
        $stmt->execute(array(
            ':title' => $title,
            ':content' => $content,
            ':url' => $url,
        ));
    }
}

// Connection settings for the local `test` database that holds the `36kr`
// table. The connection charset is set in the DSN to match the table's utf8
// charset, so text is not transcoded through the server's default charset.
$dsn = 'mysql:host=localhost;dbname=test;charset=utf8';
$user = 'root';
$password = 'root';

// Crawl article ids 200100 through 206670 and store every extracted article.
$kr = new Fork36kr(200100, 206670, $dsn, $user, $password);
$kr->fork();

[文件]  phpcn.sql

-- phpMyAdmin SQL Dump
-- version 4.0.5
-- http://www.php.cn/
--
-- Host: localhost
-- Generation time: 2013-10-03 00:36
-- Server version: 5.6.12-log
-- PHP version: 5.5.3

-- With NO_AUTO_VALUE_ON_ZERO an explicit 0 in an AUTO_INCREMENT column is
-- stored as 0 instead of generating the next sequence value.
SET SQL_MODE = "NO_AUTO_VALUE_ON_ZERO";
SET time_zone = "+00:00";


-- The /*!40101 ... */ statements are MySQL version-conditional comments:
-- servers at version 4.1.1 or later execute them. They save the session
-- character-set settings and switch the connection to utf8 for the import.
/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
/*!40101 SET NAMES utf8 */;

--
-- Database: `test`
--

-- --------------------------------------------------------

--
-- Table structure for table `36kr`
--
-- One row per stored article: auto-generated id, title, body text and the
-- URL the row refers to.
--

CREATE TABLE IF NOT EXISTS `36kr` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `title` varchar(128) NOT NULL,
  `content` text NOT NULL,
  `url` varchar(128) NOT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 AUTO_INCREMENT=1 ;

-- Restore the session character-set settings saved at the top of the dump.
/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;

                               

                   

Statement:
The content of this article is voluntarily contributed by netizens, and the copyright belongs to the original author. This site does not assume corresponding legal responsibility. If you find any content suspected of plagiarism or infringement, please contact admin@php.cn