Maison  >  Article  >  développement back-end  >  爬虫_电影ftp下载地址

爬虫_电影ftp下载地址

WBOY
WBOYoriginal
2016-08-08 09:21:163383parcourir

站点:http://www.dy2018.com/

数据库:mysql 账号:root 密码:123456

建表语句:CREATE TABLE dy2008_url (id int(9) NOT NULL AUTO_INCREMENT, url varchar(2000) NOT NULL, status tinyint(2) NOT NULL, PRIMARY KEY(id));

代码:

<?php declare(ticks = 1);
	pcntl_signal(SIGQUIT, &#39;signal_handler&#39;);
	pcntl_signal(SIGTERM, &#39;signal_handler&#39;);

	$crawlers_pid = array();
	$finish_count = 0;

	//信号处理函数
	function signal_handler($signal) 
	{
	    global $crawlers_pid;
	    if ($signal == SIGQUIT || $signal == SIGTERM) 
	    {
	        foreach ($crawlers_pid as $pid) {
	            posix_kill($pid,SIGTERM);
	        }
	        echo "---------- crawl task exit ----------";
	        global $con;//mysql
	        exit();
	    }
	}

	//GET方式获取链接对应页面内容
	function get_page_content($url) 
	{
		$content = file_get_contents($url);
		return $content;
	}

	//POST方式获取链接对应页面内容
	function get_page_content_by_post($url, $arr)
	{
		$arr = http_build_query($arr);
		$opts = array (
			&#39;http&#39; => array('method' => 'POST', 'header' => 'Content-type:application/x-www-form-urlencoded'.' Content-Length:'.strlen($data).'"', 'content' => $data)
		);
		$context = stream_context_create($opts);
		$content = file_get_contents($url,false,$context);
		return $content;
	}

	//dy2018抓取主流程
	function run_dy2018() 
	{
		global $crawlers_pid;
		global $finish_count;
		$crawl_urls = array("http://www.dy2018.com/html/tv/hytv/",
		"http://www.dy2018.com/html/tv/hepai/",
		"http://www.dy2018.com/html/tv/gangtai/",
		"http://www.dy2018.com/html/tv/oumeitv/",
		"http://www.dy2018.com/html/tv/rihantv/",
		"http://www.dy2018.com/html/tv/tvzz/",
		"http://www.dy2018.com/0/",
		"http://www.dy2018.com/1/",
		"http://www.dy2018.com/2/",
		"http://www.dy2018.com/3/",
		"http://www.dy2018.com/4/",
		"http://www.dy2018.com/5/",
		"http://www.dy2018.com/6/",
		"http://www.dy2018.com/7/",
		"http://www.dy2018.com/8/",
		"http://www.dy2018.com/9/",
		"http://www.dy2018.com/10/",
		"http://www.dy2018.com/11/",
		"http://www.dy2018.com/12/",
		"http://www.dy2018.com/13/",
		"http://www.dy2018.com/14/",
		"http://www.dy2018.com/15/",
		"http://www.dy2018.com/16/",
		"http://www.dy2018.com/17/",
		"http://www.dy2018.com/18/",
		"http://www.dy2018.com/19/",
		"http://www.dy2018.com/20/");

		$i = 0;
		while($i  0){
				$crawlers_pid[$i] = $pid;
			} else {
				$url = $crawl_urls[$i];
				$con = mysql_connect("localhost", "root", "123456");
				if(!$con) {
					die('Count not connect: '.mysql_error());
				}
				mysql_select_db("mysql", $con);
				crawl_process($url);
				$finish_count++;
			}
			$i++;
		}

		//pcntl_waitpid可能会导致信号监听失败
		while (true) { 
			if($finish_count == count($crawlers_pid)) {
				echo "---------- crawl task finish ----------";
				mysql_close();
				exit();
			}
            sleep(1);
        }

	}

	//从入口链接到其下所有下载页链接抓取过程
	function crawl_process($url)
	{
		echo "start handle url:".$url;
		$page_idx = 1;
		$valid_tag = true;
		$info_url_pattern = '/\/i\/\d+.html/';
		$ftp_url_pattern = '/ftp:\/\/.*?.(swf|avi|flv|mpg|rm|mov|wav|asf|3gp|mkv|rmvb)/i';//^$两个符号不起作用
		while($valid_tag) {
			$page_url = get_page_index_url($url, $page_idx);
			printf("start crawl url:".$page_url."\n");
			$page_content = get_page_content($page_url);
			$valid_tag = is_valid_page($page_content);
			if($valid_tag) {
				$matches_urls = array();
				preg_match_all($info_url_pattern, $page_content, $matches_urls);
				$page_content = mb_convert_encoding($page_content, "UTF-8", "GBK");
				for($i=0; $i<count get_page_content mb_convert_encoding preg_match_all array for array_values foreach as mysql_query into dy2018_url status values echo mysql_error sleep function get_page_index_url if else> 1){
			$idx_url = $idx_url.'index_'.$idx.'.html';
		}
		return $idx_url;
	}

	//根据页面内容判断链接是否有效
	function is_valid_page($content)
	{
		return $content?true:false;
	}
	run_dy2018();
	mysql_close();
?></count>

结果:

以上就介绍了爬虫_电影ftp下载地址,包括了方面的内容,希望对PHP教程有兴趣的朋友有所帮助。

Déclaration:
Le contenu de cet article est volontairement contribué par les internautes et les droits d'auteur appartiennent à l'auteur original. Ce site n'assume aucune responsabilité légale correspondante. Si vous trouvez un contenu suspecté de plagiat ou de contrefaçon, veuillez contacter admin@php.cn