search
HomeBackend DevelopmentPHP Tutorialphp网页分析 内容抓取 爬虫 资料分析

php网页分析 内容抓取 爬虫 文件分析

//获取所有内容url保存到文件
function get_index($save_file, $prefix="index_"){
    $count = 68;
    $i = 1;
    if (file_exists($save_file)) @unlink($save_file);
    $fp = fopen($save_file, "a+") or die("Open ". $save_file ." failed");
    while($i        $url = $prefix . $i .".htm";
        echo "Get ". $url ."...";
        $url_str = get_content_url(get_url($url));
        echo " OK\n";
        fwrite($fp, $url_str);
        ++$i;
    }
    fclose($fp);
}

//获取目标多媒体对象
function get_object($url_file, $save_file, $split="|--:**:--|"){
    if (!file_exists($url_file)) die($url_file ." not exist");
    $file_arr = file($url_file);
    if (!is_array($file_arr) || empty($file_arr)) die($url_file ." not content");
    $url_arr = array_unique($file_arr);
    if (file_exists($save_file)) @unlink($save_file);
    $fp = fopen($save_file, "a+") or die("Open save file ". $save_file ." failed");
    foreach($url_arr as $url){
        if (empty($url)) continue;
        echo "Get ". $url ."...";
        $html_str = get_url($url);
        echo $html_str;
        echo $url;
        exit;
        $obj_str = get_content_object($html_str);
        echo " OK\n";
        fwrite($fp, $obj_str);
    }
    fclose($fp);
}

//遍历目录获取文件内容
function get_dir($save_file, $dir){
    $dp = opendir($dir);
    if (file_exists($save_file)) @unlink($save_file);
    $fp = fopen($save_file, "a+") or die("Open save file ". $save_file ." failed");
    while(($file = readdir($dp)) != false){
        if ($file!="." && $file!=".."){
            echo "Read file ". $file ."...";
            $file_content = file_get_contents($dir . $file);
            $obj_str = get_content_object($file_content);
            echo " OK\n";
            fwrite($fp, $obj_str);
        }
    }
    fclose($fp);
}


//获取指定url内容
function get_url($url){
    $reg = '/^http:\/\/[^\/].+$/';
    if (!preg_match($reg, $url)) die($url ." invalid");
    $fp = fopen($url, "r") or die("Open url: ". $url ." failed.");
    while($fc = fread($fp, 8192)){
        $content .= $fc;
    }
    fclose($fp);
    if (empty($content)){
        die("Get url: ". $url ." content failed.");
    }
    return $content;
}

//使用socket获取指定网页
function get_content_by_socket($url, $host){
    $fp = fsockopen($host, 80) or die("Open ". $url ." failed");
    $header = "GET /".$url ." HTTP/1.1\r\n";
    $header .= "Accept: */*\r\n";
    $header .= "Accept-Language: zh-cn\r\n";
    $header .= "Accept-Encoding: gzip, deflate\r\n";
    $header .= "User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Maxthon; InfoPath.1; .NET CLR 2.0.50727)\r\n";
    $header .= "Host: ". $host ."\r\n";
    $header .= "Connection: Keep-Alive\r\n";
    //$header .= "Cookie: cnzz02=2; rtime=1; ltime=1148456424859; cnzz_eid=56601755-\r\n\r\n";
    $header .= "Connection: Close\r\n\r\n";

    fwrite($fp, $header);
    while (!feof($fp)) {
        $contents .= fgets($fp, 8192);
    }
    fclose($fp);
    return $contents;
}


//获取指定内容里的url
function get_content_url($host_url, $file_contents){

    //$reg = '/^(#|javascript.*?|ftp:\/\/.+|http:\/\/.+|.*?href.*?|play.*?|index.*?|.*?asp)+$/i';
    //$reg = '/^(down.*?\.html|\d+_\d+\.htm.*?)$/i';
    $rex = "/([hH][rR][eE][Ff])\s*=\s*['\"]*([^>'\"\s]+)[\"'>]*\s*/i";
    $reg = '/^(down.*?\.html)$/i';
    preg_match_all ($rex, $file_contents, $r);
    $result = ""; //array();
    foreach($r as $c){
        if (is_array($c)){
            foreach($c as $d){
                if (preg_match($reg, $d)){ $result .= $host_url . $d."\n"; }
            }
        }
    }
    return $result;
}

//获取指定内容中的多媒体文件
function get_content_object($str, $split="|--:**:--|"){   
    $regx = "/href\s*=\s*['\"]*([^>'\"\s]+)[\"'>]*\s*(.*?)/i";
    preg_match_all($regx, $str, $result);

    if (count($result) == 3){
        $result[2] = str_replace("多媒体: ", "", $result[2]);
        $result[2] = str_replace("
", "", $result[2]);
        $result = $result[1][0] . $split .$result[2][0] . "\n";
    }
    return $result;
}

?>

//PHP 访问网页
$page = '';
$handler = fopen('http://www.baidu.com','r');
while(!feof($handler)){
   $page.=fread($handler,1048576);
}
fclose($handler);
echo $page;
?>


2:判断这个页面是否是报错页面 
  
/** 
      *   $host   服务器地址 
      *   $get   请求页面 
      * 
      */ 
function   getHttpStatus($host,$get="")   { 
$fp   =   fsockopen($host,   80); 
if   (!$fp)   { 
          $res=   -1; 
}   else   { 
  
          fwrite($fp,   "GET   /".$get."   HTTP/1.0\r\n\r\n"); 
          stream_set_timeout($fp,   2); 
          $res   =   fread($fp,   128); 
          $info   =   stream_get_meta_data($fp); 
          fclose($fp); 
          if   ($info['timed_out'])   { 
                  $res=0; 
          }   else   { 
                  $res=   substr($res,9,3); 
          } 

return   $res; 

  
$good=array("200","302"); 
if(in_array(getHttpStatus("5y.nuc.edu.cn","/"),$good))   { 
echo   "正常"; 
}   else   { 
echo   getHttpStatus("5y.nuc.edu.cn","/"); 

  
if(in_array(getHttpStatus("5y.nuc.edu.cn","/error.php"),$good))   { 
echo   "正常"; 
}   else   { 
echo   getHttpStatus("5y.nuc.edu.cn","/error.php"); 

?> 
  
返回 
第一个返回"正常" 
第二个不存在返回"404" 

function   getHttpStatus($host,$get="")   {
//访问网页 获得服务器状态码
$fp   =   fsockopen($host,   80); 
if   (!$fp)   { 
          $res=   -1; 
}   else   { 
  
          fwrite($fp,   "GET   /".$get."   HTTP/1.0\r\n\r\n"); 
          stream_set_timeout($fp,   2); 
          $res   =   fread($fp,   128); 
          $info   =   stream_get_meta_data($fp); 
          fclose($fp); 
          if   ($info['timed_out'])   { 
                  $res=0; 
          }   else   { 
                  $res=   substr($res,9,3); 
          } 

return   $res; 

  
echo   getHttpStatus("5y.nuc.edu.cn","/"); 
echo   getHttpStatus("community.csdn.net","Expert/topic/4758/4758574.xml?temp=.1661646"); 
  
返回 
1   无法连接服务器 
0   超时 
200   OK   (成功返回) 
302   Found   (找到) 
404   没有找到 
...

  
//遍历所有网页   ($type指定类型) 
function   getAllPage($path="./",$type=array("html","htm"))   { 
global   $p; 
if   ($handle   =   opendir($path))   { 
          while   (false   !==   ($file   =   readdir($handle)))   { 
              if(is_dir($file)   &&   $file!="."   &&   $file!="..")   { 
              getAllPage($path.$file."/",$type); 
              }   else   { 
                  $ex=array_pop(explode(".",$file)); 
                  
                  if(in_array(strtolower($ex),$type))   { 
                    array_push($p,   $path.$file); 
                  } 
                  } 
          } 
                  closedir($handle); 


  
$p=array(); 
getAllPage("./"); 
echo   "

";  <br>print_r($p);  <br>echo   "
"; 
  
?>


//抓取页面内容中所有URL。
$str='2006年05月30日   15:13:00   |   评论   (0)'; 
  
preg_match_all("/href=\"([^\"]*\.*php[^\"]*)\"/si",$str,$m); 
//换用下面这个获取所有类型URL 
//preg_match_all("/href=\"([^\"]*)\"/si",$str,$m); 
  
print_r($m[1]); 
?> 

包含在链接里带get参数的地址 
$str=file_get_contents("http://www.php.net"); 
preg_match_all("/href=\"([^\"]*\.*php\?[^\"]*)\"/si",$str,$m); 
print_r($m[1]); 
?>

Statement
The content of this article is voluntarily contributed by netizens, and the copyright belongs to the original author. This site does not assume corresponding legal responsibility. If you find any content suspected of plagiarism or infringement, please contact admin@php.cn
Beyond the Hype: Assessing PHP's Role TodayBeyond the Hype: Assessing PHP's Role TodayApr 12, 2025 am 12:17 AM

PHP remains a powerful and widely used tool in modern programming, especially in the field of web development. 1) PHP is easy to use and seamlessly integrated with databases, and is the first choice for many developers. 2) It supports dynamic content generation and object-oriented programming, suitable for quickly creating and maintaining websites. 3) PHP's performance can be improved by caching and optimizing database queries, and its extensive community and rich ecosystem make it still important in today's technology stack.

What are Weak References in PHP and when are they useful?What are Weak References in PHP and when are they useful?Apr 12, 2025 am 12:13 AM

In PHP, weak references are implemented through the WeakReference class and will not prevent the garbage collector from reclaiming objects. Weak references are suitable for scenarios such as caching systems and event listeners. It should be noted that it cannot guarantee the survival of objects and that garbage collection may be delayed.

Explain the __invoke magic method in PHP.Explain the __invoke magic method in PHP.Apr 12, 2025 am 12:07 AM

The \_\_invoke method allows objects to be called like functions. 1. Define the \_\_invoke method so that the object can be called. 2. When using the $obj(...) syntax, PHP will execute the \_\_invoke method. 3. Suitable for scenarios such as logging and calculator, improving code flexibility and readability.

Explain Fibers in PHP 8.1 for concurrency.Explain Fibers in PHP 8.1 for concurrency.Apr 12, 2025 am 12:05 AM

Fibers was introduced in PHP8.1, improving concurrent processing capabilities. 1) Fibers is a lightweight concurrency model similar to coroutines. 2) They allow developers to manually control the execution flow of tasks and are suitable for handling I/O-intensive tasks. 3) Using Fibers can write more efficient and responsive code.

The PHP Community: Resources, Support, and DevelopmentThe PHP Community: Resources, Support, and DevelopmentApr 12, 2025 am 12:04 AM

The PHP community provides rich resources and support to help developers grow. 1) Resources include official documentation, tutorials, blogs and open source projects such as Laravel and Symfony. 2) Support can be obtained through StackOverflow, Reddit and Slack channels. 3) Development trends can be learned by following RFC. 4) Integration into the community can be achieved through active participation, contribution to code and learning sharing.

PHP vs. Python: Understanding the DifferencesPHP vs. Python: Understanding the DifferencesApr 11, 2025 am 12:15 AM

PHP and Python each have their own advantages, and the choice should be based on project requirements. 1.PHP is suitable for web development, with simple syntax and high execution efficiency. 2. Python is suitable for data science and machine learning, with concise syntax and rich libraries.

PHP: Is It Dying or Simply Adapting?PHP: Is It Dying or Simply Adapting?Apr 11, 2025 am 12:13 AM

PHP is not dying, but constantly adapting and evolving. 1) PHP has undergone multiple version iterations since 1994 to adapt to new technology trends. 2) It is currently widely used in e-commerce, content management systems and other fields. 3) PHP8 introduces JIT compiler and other functions to improve performance and modernization. 4) Use OPcache and follow PSR-12 standards to optimize performance and code quality.

The Future of PHP: Adaptations and InnovationsThe Future of PHP: Adaptations and InnovationsApr 11, 2025 am 12:01 AM

The future of PHP will be achieved by adapting to new technology trends and introducing innovative features: 1) Adapting to cloud computing, containerization and microservice architectures, supporting Docker and Kubernetes; 2) introducing JIT compilers and enumeration types to improve performance and data processing efficiency; 3) Continuously optimize performance and promote best practices.

See all articles

Hot AI Tools

Undresser.AI Undress

Undresser.AI Undress

AI-powered app for creating realistic nude photos

AI Clothes Remover

AI Clothes Remover

Online AI tool for removing clothes from photos.

Undress AI Tool

Undress AI Tool

Undress images for free

Clothoff.io

Clothoff.io

AI clothes remover

AI Hentai Generator

AI Hentai Generator

Generate AI Hentai for free.

Hot Article

R.E.P.O. Energy Crystals Explained and What They Do (Yellow Crystal)
3 weeks agoBy尊渡假赌尊渡假赌尊渡假赌
R.E.P.O. Best Graphic Settings
3 weeks agoBy尊渡假赌尊渡假赌尊渡假赌
R.E.P.O. How to Fix Audio if You Can't Hear Anyone
3 weeks agoBy尊渡假赌尊渡假赌尊渡假赌
WWE 2K25: How To Unlock Everything In MyRise
4 weeks agoBy尊渡假赌尊渡假赌尊渡假赌

Hot Tools

SAP NetWeaver Server Adapter for Eclipse

SAP NetWeaver Server Adapter for Eclipse

Integrate Eclipse with SAP NetWeaver application server.

PhpStorm Mac version

PhpStorm Mac version

The latest (2018.2.1) professional PHP integrated development tool

Safe Exam Browser

Safe Exam Browser

Safe Exam Browser is a secure browser environment for taking online exams securely. This software turns any computer into a secure workstation. It controls access to any utility and prevents students from using unauthorized resources.

Dreamweaver Mac version

Dreamweaver Mac version

Visual web development tools

MinGW - Minimalist GNU for Windows

MinGW - Minimalist GNU for Windows

This project is in the process of being migrated to osdn.net/projects/mingw, you can continue to follow us there. MinGW: A native Windows port of the GNU Compiler Collection (GCC), freely distributable import libraries and header files for building native Windows applications; includes extensions to the MSVC runtime to support C99 functionality. All MinGW software can run on 64-bit Windows platforms.