php网页分析 内容抓取 爬虫 文件分析
/**
 * Crawl the numbered index pages and save every content URL found to a file.
 *
 * Pages are requested as $prefix . N . ".htm" for N = 1..$count, and the URLs
 * extracted from each page are appended to $save_file, one per line.
 *
 * @param string $save_file Output file; deleted first if it already exists.
 * @param string $prefix    URL prefix of the index pages. get_url() only
 *                          accepts absolute http:// URLs, so the default is
 *                          only useful as a placeholder.
 * @return void Terminates the script via die() if the output file cannot be opened.
 */
function get_index($save_file, $prefix="index_"){
    $count = 68; // number of index pages to crawl

    if (file_exists($save_file)) @unlink($save_file);
    $fp = fopen($save_file, "a+") or die("Open ". $save_file ." failed");

    // get_content_url() needs the base URL that extracted links are appended to.
    // NOTE(review): assumes the links are relative to the directory of the
    // index pages, i.e. everything in $prefix up to the last "/" - confirm.
    $slash = strrpos($prefix, "/");
    $base = ($slash === false) ? "" : substr($prefix, 0, $slash + 1);

    // NOTE(review): the original loop condition was lost in transcription;
    // "$i <= $count" is reconstructed from the $i = 1 / $count = 68 setup.
    for ($i = 1; $i <= $count; ++$i) {
        $url = $prefix . $i .".htm";
        echo "Get ". $url ."...";
        $url_str = get_content_url($base, get_url($url));
        echo " OK\n";
        fwrite($fp, $url_str);
    }
    fclose($fp);
}
/**
 * Download every URL listed in a file and save the media reference extracted
 * from each page.
 *
 * @param string $url_file  File with one URL per line (duplicates are skipped).
 * @param string $save_file Output file; deleted first if it already exists.
 * @param string $split     Separator passed on to get_content_object().
 * @return void Terminates the script via die() when a file cannot be used.
 */
function get_object($url_file, $save_file, $split="|--:**:--|"){
    if (!file_exists($url_file)) die($url_file ." not exist");
    $file_arr = file($url_file, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
    if (!is_array($file_arr) || empty($file_arr)) die($url_file ." not content");

    // Strip stray whitespace / "\r" so the same URL is not fetched twice and
    // no line terminator ends up inside the requested URL.
    $url_arr = array_unique(array_map('trim', $file_arr));

    if (file_exists($save_file)) @unlink($save_file);
    $fp = fopen($save_file, "a+") or die("Open save file ". $save_file ." failed");
    foreach ($url_arr as $url) {
        if ($url === "") continue;
        echo "Get ". $url ."...";
        // The original dumped the page and called exit here (debug leftover),
        // so nothing was ever written to $save_file.
        $obj_str = get_content_object(get_url($url), $split);
        echo " OK\n";
        fwrite($fp, $obj_str);
    }
    fclose($fp);
}
/**
 * Read every regular file in a directory and save the media reference
 * extracted from each one.
 *
 * @param string $save_file Output file; deleted first if it already exists.
 * @param string $dir       Directory to scan (not recursive), with or without
 *                          a trailing slash.
 * @return void Terminates the script via die() when the directory or the
 *              output file cannot be opened.
 */
function get_dir($save_file, $dir){
    $dp = opendir($dir);
    if ($dp === false) die("Open dir ". $dir ." failed");

    if (file_exists($save_file)) @unlink($save_file);
    $fp = fopen($save_file, "a+") or die("Open save file ". $save_file ." failed");

    // Guarantee exactly one separator between directory and file name.
    $base = rtrim($dir, "/\\") . "/";

    // Strict comparison: an entry literally named "0" must not end the loop.
    while (($file = readdir($dp)) !== false) {
        if ($file == "." || $file == "..") continue;
        $path = $base . $file;
        if (!is_file($path)) continue; // sub-directories have no content to read
        echo "Read file ". $file ."...";
        $file_content = file_get_contents($path);
        $obj_str = get_content_object($file_content);
        echo " OK\n";
        fwrite($fp, $obj_str);
    }
    closedir($dp);
    fclose($fp);
}
/**
 * Download the body of an absolute http:// URL.
 *
 * @param string $url URL to fetch; must start with "http://".
 * @return string The content read from the URL.
 *                Terminates the script via die() when the URL is malformed,
 *                cannot be opened, or yields no content.
 */
function get_url($url){
    $reg = '/^http:\/\/[^\/].+$/';
    if (!preg_match($reg, $url)) die($url ." invalid");
    $fp = fopen($url, "r") or die("Open url: ". $url ." failed.");
    // Read to EOF in one call; the previous fread() loop appended to an
    // undefined variable and stopped early on a chunk equal to "0".
    $content = stream_get_contents($fp);
    fclose($fp);
    if ($content === false || $content === ""){
        die("Get url: ". $url ." content failed.");
    }
    return $content;
}
/**
 * Fetch a page through a raw socket connection to port 80.
 *
 * @param string $url  Path of the page, without the leading "/".
 * @param string $host Host name to connect to.
 * @return string The raw HTTP response (status line, headers and body).
 *                Terminates the script via die() if the connection fails.
 */
function get_content_by_socket($url, $host){
    $fp = fsockopen($host, 80) or die("Open ". $url ." failed");
    $header = "GET /".$url ." HTTP/1.1\r\n";
    $header .= "Accept: */*\r\n";
    $header .= "Accept-Language: zh-cn\r\n";
    // No Accept-Encoding header: this function returns the response as
    // received and never decompresses it, so gzip must not be advertised.
    $header .= "User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Maxthon; InfoPath.1; .NET CLR 2.0.50727)\r\n";
    $header .= "Host: ". $host ."\r\n";
    // Only "Close" is sent (the original also sent a contradictory
    // "Keep-Alive"); the read loop below relies on the server closing the
    // connection to reach EOF.
    $header .= "Connection: Close\r\n\r\n";
    fwrite($fp, $header);

    $contents = "";
    while (!feof($fp)) {
        $line = fgets($fp, 8192);
        if ($line === false) break; // read error or EOF
        $contents .= $line;
    }
    fclose($fp);
    return $contents;
}
/**
 * Collect the link targets in a piece of HTML that look like "down....html".
 *
 * Every capture produced by the href pattern is checked against the filter
 * pattern; each one that matches is prefixed with $host_url and terminated
 * with a newline.
 *
 * @param string $host_url      Text prepended to every matching link.
 * @param string $file_contents HTML to scan.
 * @return string Matching URLs, one per line ("" when there are none).
 */
function get_content_url($host_url, $file_contents){
    $href_pattern = "/([hH][rR][eE][Ff])\s*=\s*['\"]*([^>'\"\s]+)[\"'>]*\s*/i";
    $wanted_pattern = '/^(down.*?\.html)$/i';

    preg_match_all($href_pattern, $file_contents, $groups);

    $lines = array();
    foreach ($groups as $captures) {
        if (!is_array($captures)) {
            continue;
        }
        // preg_grep keeps the captures that match, in their original order.
        foreach (preg_grep($wanted_pattern, $captures) as $hit) {
            $lines[] = $host_url . $hit . "\n";
        }
    }
    return implode("", $lines);
}
/**
 * Extract the first link of an HTML fragment as a "target<split>label" record.
 *
 * @param string $str   HTML to scan.
 * @param string $split Separator placed between link target and label.
 * @return string The record followed by "\n", or "" when $str contains no
 *                href (the original read undefined offsets in that case and
 *                returned a bare separator line).
 */
function get_content_object($str, $split="|--:**:--|"){
    // NOTE(review): the trailing lazy group "(.*?)" has nothing after it, so
    // it always captures an empty label; the pattern presumably lost its
    // closing delimiter in transcription - confirm against the original.
    $regx = "/href\s*=\s*['\"]*([^>'\"\s]+)[\"'>]*\s*(.*?)/i";
    if (!preg_match($regx, (string) $str, $m)) {
        return "";
    }
    // Strip the "multimedia: " caption prefix from the label. A second
    // str_replace() with an empty search string was dropped: it was a no-op.
    $label = str_replace("多媒体: ", "", $m[2]);
    return $m[1] . $split . $label . "\n";
}
?>
// Fetch a web page over HTTP and print it.
$page = '';
$handler = fopen('http://www.baidu.com','r');
if ($handler === false) {
    // Without this check feof()/fread() would be called on a non-resource.
    die('Open url: http://www.baidu.com failed.');
}
while (!feof($handler)) {
    $chunk = fread($handler, 1048576);
    if ($chunk === false) {
        break; // read error
    }
    $page .= $chunk;
}
fclose($handler);
echo $page;
?>
2:判断这个页面是否是报错页面
/**
 * Request a page and report the HTTP status code of the response.
 *
 * @param string $host Server host name; the connection is made to port 80.
 * @param string $get  Path of the page, with or without a leading "/".
 * @return int|string -1 when the connection fails, 0 when the read times out
 *                    (2 seconds), otherwise the status code as a string such
 *                    as "200" or "404".
 */
function getHttpStatus($host,$get="") {
    $fp = fsockopen($host, 80);
    if (!$fp) {
        return -1;
    }
    // Callers pass paths both with and without a leading slash; normalise so
    // the request line never contains "//".
    $path = "/" . ltrim($get, "/");
    // The Host header lets name-based virtual hosts route the request.
    fwrite($fp, "GET ". $path ." HTTP/1.0\r\nHost: ". $host ."\r\n\r\n");
    stream_set_timeout($fp, 2);
    $res = (string) fread($fp, 128);
    $info = stream_get_meta_data($fp);
    fclose($fp);
    if ($info['timed_out']) {
        return 0;
    }
    // Parse the status line; fall back to the original fixed offset when the
    // response does not start with a recognisable "HTTP/x.y NNN".
    if (preg_match('/^HTTP\/\S+\s+(\d{3})/', $res, $m)) {
        return $m[1];
    }
    return substr($res, 9, 3);
}
// Report whether two sample pages answer with an acceptable status code.
$good=array("200","302");
foreach (array("/", "/error.php") as $page) {
    // Ask once per page: the original issued a second request just to print
    // the code, which doubled the traffic and could disagree with the test.
    $status = getHttpStatus("5y.nuc.edu.cn", $page);
    if (in_array((string) $status, $good, true)) {
        echo "正常";
    } else {
        echo $status;
    }
}
?>
返回
第一个返回"正常"
第二个不存在返回"404"
/**
 * Request a page and report the HTTP status code sent by the server.
 *
 * @param string $host Server host name; the connection is made to port 80.
 * @param string $get  Path of the page, with or without a leading "/".
 * @return int|string -1 when the connection fails, 0 when the read times out
 *                    (2 seconds), otherwise the status code as a string such
 *                    as "200" or "404".
 */
function getHttpStatus($host,$get="") {
    $fp = fsockopen($host, 80);
    if (!$fp) {
        return -1;
    }
    // Callers pass paths both with and without a leading slash; normalise so
    // the request line never contains "//".
    $path = "/" . ltrim($get, "/");
    // The Host header lets name-based virtual hosts route the request.
    fwrite($fp, "GET ". $path ." HTTP/1.0\r\nHost: ". $host ."\r\n\r\n");
    stream_set_timeout($fp, 2);
    $res = (string) fread($fp, 128);
    $info = stream_get_meta_data($fp);
    fclose($fp);
    if ($info['timed_out']) {
        return 0;
    }
    // Parse the status line; fall back to the original fixed offset when the
    // response does not start with a recognisable "HTTP/x.y NNN".
    if (preg_match('/^HTTP\/\S+\s+(\d{3})/', $res, $m)) {
        return $m[1];
    }
    return substr($res, 9, 3);
}
// Print the status code of two sample pages, back to back with no separator.
$targets = array(
    array("5y.nuc.edu.cn", "/"),
    array("community.csdn.net", "Expert/topic/4758/4758574.xml?temp=.1661646"),
);
foreach ($targets as $target) {
    echo getHttpStatus($target[0], $target[1]);
}
返回
-1 无法连接服务器
0 超时
200 OK (成功返回)
302 Found (找到)
404 没有找到
...
/**
 * Recursively collect the files below a directory whose extension is listed
 * in $type. Results are appended to the global array $p.
 *
 * @param string $path Directory to scan; must end with "/".
 * @param array  $type Lower-case file extensions to collect, without the dot.
 * @return void
 */
function getAllPage($path="./",$type=array("html","htm")) {
    global $p;
    if (!is_array($p)) {
        $p = array();
    }
    if ($handle = opendir($path)) {
        while (false !== ($file = readdir($handle))) {
            if ($file == "." || $file == "..") {
                continue;
            }
            $full = $path . $file;
            // Test the entry relative to $path: the original called
            // is_dir($file), which is resolved against the working directory
            // and so never descended into nested directories.
            if (is_dir($full)) {
                getAllPage($full . "/", $type);
                continue;
            }
            // pathinfo() yields "" for a name without a dot, so a file that
            // is merely called "html" is no longer mistaken for a page.
            $ex = strtolower(pathinfo($file, PATHINFO_EXTENSION));
            if (in_array($ex, $type, true)) {
                $p[] = $full;
            }
        }
        closedir($handle);
    }
}
// Collect every html/htm file below the current directory and dump the list.
$p=array();
getAllPage("./");
// The stray "<br>" tokens between the statements were syntax errors and have
// been removed. NOTE(review): the two echo strings presumably once held
// opening/closing markup around the dump (e.g. a <pre> pair) - confirm.
echo "";
print_r($p);
echo "
";
?>
// Extract every URL in the page content that points at a .php resource.
// NOTE(review): this sample string contains no href attribute (its markup
// appears to have been lost), so the match list printed below is empty.
$str='2006年05月30日 15:13:00 | 评论 (0)';
// "\.php" requires a literal dot before "php"; the original "\.*php" made
// the dot optional and therefore matched any URL merely containing "php".
preg_match_all("/href=\"([^\"]*\.php[^\"]*)\"/si",$str,$m);
// Use the pattern below instead to capture URLs of every type:
//preg_match_all("/href=\"([^\"]*)\"/si",$str,$m);
print_r($m[1]);
?>
包含在链接里带get参数的地址
// Fetch the php.net home page and list the linked .php URLs that carry a
// query string.
$str=file_get_contents("http://www.php.net");
if ($str === false) {
    // file_get_contents() returns false when the page cannot be fetched.
    die("Get url: http://www.php.net content failed.");
}
// "\.php\?" requires a literal ".php?"; the original "\.*php\?" made the dot
// optional and matched any URL containing "php?".
preg_match_all("/href=\"([^\"]*\.php\?[^\"]*)\"/si",$str,$m);
print_r($m[1]);
?>

The secret to keeping a PHP-powered website running smoothly under heavy load involves several key strategies: 1) Implement opcode caching with OPcache to reduce script execution time, 2) Use database query caching with Redis to lessen database load, 3) Leverage CDNs like Cloudflare for serving…

You should care about Dependency Injection (DI) because it makes your code clearer and easier to maintain. 1) DI makes code more modular by decoupling classes, 2) it makes testing easier and the code more flexible, 3) use DI containers to manage complex dependencies, but watch for performance impact and circular dependencies, 4) the best practice is to depend on abstract interfaces to achieve loose coupling.

Yes, optimizing a PHP application is possible and essential. 1) Implement caching using APCu to reduce database load. 2) Optimize databases with indexing, efficient queries, and connection pooling. 3) Enhance code with built-in functions, avoiding global variables, and using opcode caching…

The key strategies to significantly boost PHP application performance are: 1) Use opcode caching like OPcache to reduce execution time, 2) Optimize database interactions with prepared statements and proper indexing, 3) Configure web servers like Nginx with PHP-FPM for better performance, 4)…

A PHP Dependency Injection Container is a tool that manages class dependencies, enhancing code modularity, testability, and maintainability. It acts as a central hub for creating and injecting dependencies, thus reducing tight coupling and easing unit testing.

Choose Dependency Injection (DI) for large applications; Service Locator is suitable for small projects or prototypes. 1) DI improves the testability and modularity of the code through constructor injection. 2) Service Locator obtains services through a central registry, which is convenient but may increase code coupling.

PHP applications can be optimized for speed and efficiency by: 1) enabling opcache in php.ini, 2) using prepared statements with PDO for database queries, 3) replacing loops with array_filter and array_map for data processing, 4) configuring Nginx as a reverse proxy, 5) implementing caching with…

PHP email validation involves three steps: 1) Format validation using regular expressions to check the email format; 2) DNS validation to ensure the domain has a valid MX record; 3) SMTP validation, the most thorough method, which checks if the mailbox exists by connecting to the SMTP server. Impl…


Hot AI Tools

Undresser.AI Undress
AI-powered app for creating realistic nude photos

AI Clothes Remover
Online AI tool for removing clothes from photos.

Undress AI Tool
Undress images for free

Clothoff.io
AI clothes remover

Video Face Swap
Swap faces in any video effortlessly with our completely free AI face swap tool!

Hot Article

Hot Tools

SublimeText3 Chinese version
Chinese version, very easy to use

Notepad++7.3.1
Easy-to-use and free code editor

SublimeText3 Linux new version
SublimeText3 Linux latest version

MantisBT
Mantis is an easy-to-deploy web-based defect tracking tool designed to aid in product defect tracking. It requires PHP, MySQL and a web server. Check out our demo and hosting services.

SAP NetWeaver Server Adapter for Eclipse
Integrate Eclipse with SAP NetWeaver application server.
