(转)php抓取网页内容汇总
①、使用php获取网页内容
http://hi.baidu.com/quqiufeng/blog/item/7e86fb3f40b598c67d1e7150.html
header("Content-type: text/html; charset=utf-8");
1、
<?php
// Fetch a page through the MSXML2.XMLHTTP COM object (requires PHP's COM extension on Windows).
$xhr = new COM("MSXML2.XMLHTTP");
// The third argument (false) requests a synchronous call, so send() returns only once the response is available.
$xhr->open("GET", "http://localhost/xxx.php?id=2", false);
$xhr->send();
// Print the response body; the original line was missing its terminating semicolon.
echo $xhr->responseText;
?>
2、file_get_contents实现
<?php
// Download the whole page into a string and print it.
$url = "http://www.blogjava.net/pts";
$html = file_get_contents($url);
// file_get_contents() returns false on failure; report it instead of silently printing nothing.
if ($html === false) {
    die("Get url: " . $url . " failed.");
}
echo $html;
?>
3、fopen()实现
<?php
// Open the page as a read-only stream; fopen() returns false on failure, which skips the block.
if ($stream = fopen('http://www.sohu.com', 'r')) {
    // print all the page starting at the offset 10
    echo stream_get_contents($stream, -1, 10);
    fclose($stream);
}
if ($stream = fopen('http://www.sohu.net', 'r')) {
    // print the first 5 bytes
    echo stream_get_contents($stream, 5);
    fclose($stream);
}
?>
②、使用php获取网页内容
http://www.blogjava.net/pts/archive/2007/08/26/99188.html
简单的做法:
<?php
// Download the whole page into a string and print it.
$url = "http://www.blogjava.net/pts";
$html = file_get_contents($url);
// file_get_contents() returns false on failure; report it instead of silently printing nothing.
if ($html === false) {
    die("Get url: " . $url . " failed.");
}
echo $html;
?>
或者:
<?php
// Open the page as a read-only stream; fopen() returns false on failure, which skips the block.
if ($stream = fopen('http://www.sohu.com', 'r')) {
    // print all the page starting at the offset 10
    echo stream_get_contents($stream, -1, 10);
    fclose($stream);
}
if ($stream = fopen('http://www.sohu.net', 'r')) {
    // print the first 5 bytes
    echo stream_get_contents($stream, 5);
    fclose($stream);
}
?>
③、PHP获取网站内容,保存为TXT文件源码
http://blog.chinaunix.net/u1/44325/showart_348444.html
// Index page of the book whose chapter links are read below.
$my_book_url = 'http://book.yunxiaoge.com/files/article/html/4/4550/index.html';
// Keep the directory part of the index URL; it is later prefixed to each chapter file name.
// preg_match() replaces ereg(), which was removed in PHP 7.
$my_book_txt = '';
if (preg_match('#http://book\.yunxiaoge\.com/files/article/html/[0-9]+/[0-9]+/#', $my_book_url, $myBook)) {
    $my_book_txt = $myBook[0];
}
$file_handle = fopen($my_book_url, "r"); // open the index page for reading
// Start from a fresh output file; skip the delete when there is nothing to remove.
if (file_exists("test.txt")) {
    unlink("test.txt");
}
while (!feof($file_handle)) { //循环到文件结束
??? $line = fgets($file_handle); //读取一行文件
??? $line1=ereg("href=\"[0-9]+.html",$line,$reg); //分析文件内部书的文章页面
?????? $handle = fopen("test.txt", 'a');
?? if ($line1) {
???? $my_book_txt_url=$reg[0]; //另外赋值,给抓取分析做准备
?? $my_book_txt_url=str_replace("href=\"","",$my_book_txt_url);
????? $my_book_txt_over_url="$my_book_txt$my_book_txt_url"; //转换为抓取地址
????? echo "$my_book_txt_over_url
下面是Snoopy的Readme
NAME:
    Snoopy - the PHP net client v1.2.4

SYNOPSIS:
    include "Snoopy.class.php";
    $snoopy = new Snoopy;

    $snoopy->fetchtext("http://www.php.net/");
    print $snoopy->results;

    $snoopy->fetchlinks("http://www.phpbuilder.com/");
    print $snoopy->results;

    $submit_url = "http://lnk.ispi.net/texis/scripts/msearch/netsearch.html";

    $submit_vars["q"] = "amiga";
    $submit_vars["submit"] = "Search!";
    $submit_vars["searchhost"] = "Altavista";

    $snoopy->submit($submit_url,$submit_vars);
    print $snoopy->results;

    $snoopy->maxframes=5;
    $snoopy->fetch("http://www.ispi.net/");
    echo "<PRE>\n";
    echo htmlentities($snoopy->results[0]);
    echo htmlentities($snoopy->results[1]);
    echo htmlentities($snoopy->results[2]);
    echo "</PRE>\n";
??? ???
??? ??? echo "
".htmlspecialchars($snoopy->results)."\n";
\n";
??? ???
??? ??? echo "
".htmlspecialchars($snoopy->results)."\n";
\n";
??? ???
??? ??? echo "
".htmlspecialchars($snoopy->results)."\n";
".htmlspecialchars($snoopy->results[0])."\n";
".htmlspecialchars($snoopy->results[1])."\n";
".htmlspecialchars($snoopy->results[2])."\n";
?
?
<?php
/**
 * Fetch every index page and save the download-page URLs found in them.
 *
 * Index pages are addressed as $prefix . $i . ".htm" for $i = 1..67. Each resulting
 * URL is passed to get_url(), which terminates the script unless it starts with http://.
 *
 * @param string $save_file File that receives one URL per line; recreated on every run.
 * @param string $prefix    Prefix of the index page URLs.
 */
function get_index($save_file, $prefix = "index_")
{
    $count = 68;
    if (file_exists($save_file)) {
        @unlink($save_file);
    }
    $fp = fopen($save_file, "a+") or die("Open ". $save_file ." failed");
    for ($i = 1; $i < $count; ++$i) {
        $url = $prefix . $i . ".htm";
        echo "Get ". $url ."...";
        $page = get_url($url);
        // NOTE(review): the original passed only the page content to get_content_url(),
        // which takes two arguments. The directory of the index page is assumed to be
        // the base for the relative links - confirm against the target site.
        $host_url = substr($url, 0, strrpos($url, '/') + 1);
        $url_str = get_content_url($host_url, $page);
        echo " OK\n";
        fwrite($fp, $url_str);
    }
    fclose($fp);
}

/**
 * Visit every URL listed in $url_file and save the media entry found on each page.
 *
 * @param string $url_file  Text file with one URL per line.
 * @param string $save_file Output file; recreated on every run.
 * @param string $split     Separator written between link and title of each entry.
 */
function get_object($url_file, $save_file, $split = "|--:**:--|")
{
    if (!file_exists($url_file)) {
        die($url_file ." not exist");
    }
    $file_arr = file($url_file);
    if (!is_array($file_arr) || empty($file_arr)) {
        die($url_file ." not content");
    }
    // file() keeps the line terminators; trim them so the URLs are usable and de-duplicate cleanly.
    $url_arr = array_unique(array_map('trim', $file_arr));
    if (file_exists($save_file)) {
        @unlink($save_file);
    }
    $fp = fopen($save_file, "a+") or die("Open save file ". $save_file ." failed");
    foreach ($url_arr as $url) {
        if (empty($url)) {
            continue;
        }
        echo "Get ". $url ."...";
        $html_str = get_url($url);
        // The leftover debug output and exit that made the rest of this loop unreachable were removed.
        $obj_str = get_content_object($html_str, $split);
        echo " OK\n";
        fwrite($fp, $obj_str);
    }
    fclose($fp);
}

/**
 * Read every file in a directory and save the media entry found in each one.
 *
 * @param string $save_file Output file; recreated on every run.
 * @param string $dir       Directory to scan; it is concatenated directly with the file
 *                          name, so it must end with a directory separator.
 */
function get_dir($save_file, $dir)
{
    $dp = opendir($dir) or die("Open dir ". $dir ." failed");
    if (file_exists($save_file)) {
        @unlink($save_file);
    }
    $fp = fopen($save_file, "a+") or die("Open save file ". $save_file ." failed");
    // Strict comparison: an entry named "0" is falsy and would end a loose != false loop early.
    while (($file = readdir($dp)) !== false) {
        if ($file == "." || $file == ".." || !is_file($dir . $file)) {
            continue;
        }
        echo "Read file ". $file ."...";
        $file_content = file_get_contents($dir . $file);
        $obj_str = get_content_object($file_content);
        echo " OK\n";
        fwrite($fp, $obj_str);
    }
    closedir($dp);
    fclose($fp);
}

/**
 * Download the content of an http:// URL.
 *
 * Terminates the script when the URL is not an http:// URL, cannot be opened,
 * or yields no content.
 *
 * @param string $url Absolute http:// URL.
 * @return string Response body.
 */
function get_url($url)
{
    // '#' is the delimiter because the pattern itself contains '/'.
    $reg = '#^http://[^/].+$#';
    if (!preg_match($reg, $url)) {
        die($url ." invalid");
    }
    $fp = fopen($url, "r") or die("Open url: ". $url ." failed.");
    // Read to end of stream; a chunk-wise "while ($fc = fread())" loop stops early on a falsy chunk such as "0".
    $content = stream_get_contents($fp);
    fclose($fp);
    if (empty($content)) {
        die("Get url: ". $url ." content failed.");
    }
    return $content;
}

/**
 * Fetch a page with a hand-written HTTP/1.1 GET request over a raw socket.
 *
 * The returned string is everything read from the socket, i.e. the response
 * headers followed by the body. Because the request advertises gzip/deflate,
 * the server may answer with a compressed body, which is returned undecoded.
 *
 * @param string $url  Path relative to the host root, without the leading slash.
 * @param string $host Host name to connect to on port 80.
 * @return string Raw HTTP response.
 */
function get_content_by_socket($url, $host)
{
    $fp = fsockopen($host, 80) or die("Open ". $url ." failed");
    $header  = "GET /" . $url . " HTTP/1.1\r\n";
    $header .= "Accept: */*\r\n";
    $header .= "Accept-Language: zh-cn\r\n";
    $header .= "Accept-Encoding: gzip, deflate\r\n";
    $header .= "User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Maxthon; InfoPath.1; .NET CLR 2.0.50727)\r\n";
    $header .= "Host: ". $host ."\r\n";
    // Only "Connection: Close" is sent: the read loop below relies on the server
    // closing the socket, which the former extra Keep-Alive header contradicted.
    $header .= "Connection: Close\r\n\r\n";
    fwrite($fp, $header);
    $contents = '';
    while (!feof($fp)) {
        $contents .= fgets($fp, 8192);
    }
    fclose($fp);
    return $contents;
}

/**
 * Collect the download-page links ("down....html") from a block of HTML.
 *
 * @param string $host_url      Prefix prepended to every matching link.
 * @param string $file_contents HTML to scan.
 * @return string Matching links, each prefixed with $host_url and terminated by "\n";
 *                empty string when there are none.
 */
function get_content_url($host_url, $file_contents)
{
    // Capture the value of every href attribute (quoted or unquoted).
    $rex = '/href\s*=\s*[\'"]*([^>\'"\s]+)["\'>]*\s*/i';
    $reg = '/^(down.*?\.html)$/i';
    preg_match_all($rex, $file_contents, $r);
    $result = "";
    foreach ($r[1] as $link) {
        if (preg_match($reg, $link)) {
            $result .= $host_url . $link . "\n";
        }
    }
    return $result;
}

/**
 * Extract the first media entry (a link followed by a <b>...</b> title) from HTML.
 *
 * @param string $str   HTML to scan.
 * @param string $split Separator placed between the link and its title.
 * @return string "<link><split><title>\n" for the first match, or an empty string
 *                when the HTML contains no such entry.
 */
function get_content_object($str, $split = "|--:**:--|")
{
    // '#' is the delimiter because the pattern contains "</b>".
    $regx = '#href\s*=\s*[\'"]*([^>\'"\s]+)["\'>]*\s*(<b>.*?</b>)#i';
    preg_match_all($regx, $str, $result);
    // preg_match_all() always fills one array per group, so test for actual matches.
    if (empty($result[1])) {
        return "";
    }
    $title = str_replace("<b>多媒体: ", "", $result[2][0]);
    $title = str_replace("</b>", "", $title);
    return $result[1][0] . $split . $title . "\n";
}
?>
标签:php抓取图片it |
分类: PHP |
1. 取得指定網頁內的所有圖片:
<?php
// Fetch the page at the given address and keep its HTML in $text.
$text = file_get_contents('http://andy.diimii.com/');
// Collect every <img> tag into $match. The heading above asks for all images,
// so preg_match_all() is used, and the "<img[^>" part of the pattern that was
// lost from the text is restored.
preg_match_all('/<img[^>]*>/Ui', $text, $match);
// Print the matches.
print_r($match);
?>
-----------------
2. 取得指定網頁內的第一張圖片:
<?php
// Fetch the page at the given address and keep its HTML in $text.
$text = file_get_contents('http://andy.diimii.com/');
// Capture the first <img> tag into $match; the "<img[^>" part of the pattern
// that was lost from the text is restored.
preg_match('/<img[^>]*>/Ui', $text, $match);
// Print the match.
print_r($match);
?>
------------------------------------
3. 取得指定網頁內的特定div區塊(藉由id判斷):
<?php
// Fetch the page at the given address and keep its HTML in $text.
$text = file_get_contents('http://andy.diimii.com/2009/01/seo%e5%8c%96%e7%9a%84%e9%97%9c%e9%8d%b5%e5%ad%97%e5%bb%a3%e5%91%8a%e9%80%a3%e7%b5%90/');
// Strip line breaks and whitespace characters (only needed for serialized content).
//$text=str_replace(array("\r","\n","\t","\s"),'', $text);
// Capture the div whose id is PostContent into $match.
// NOTE(review): the original pattern was lost from the text; this one is rebuilt
// from the description above and stops at the first closing </div>, so nested
// divs inside the block would cut the match short - confirm against the page.
preg_match('#<div[^>]*id="PostContent"[^>]*>(.*?)</div>#si', $text, $match);
// Print the whole matched block, if any.
if (isset($match[0])) {
    print($match[0]);
}
?>
-------------------------------------------
4. 上述2及3的結合:
//取得指定位址的內容,並儲存至text
$text=file_get_contents('http://andy.diimii.com/2009/01/seo%e5%8c%96%e7%9a%84%e9%97%9c%e9%8d%b5%e5%ad%97%e5%bb%a3%e5%91%8a%e9%80%a3%e7%b5%90/');???
//取出div標籤且id為PostContent的內容,並儲存至陣列match
preg_match('/
//取得第一個img標籤,並儲存至陣列match2
preg_