Home > Backend Development > PHP Tutorial > PHP function to get a web page's title and content (excluding HTML tags) - PHP tutorial

PHP function to get a web page's title and content (excluding HTML tags) - PHP tutorial

WBOY
WBOYOriginal
2016-07-13 10:39:57 · 940 views

Copy code The code is as follows:

/**
 * Fetch a web page with cURL and extract its metadata and tag-free text.
 *
 * Only responses with HTTP status 200 and a Content-Type containing
 * "text/html" are parsed; for anything else the array is returned with just
 * 'httpcode' and 'content_type' filled in. If cURL itself fails, 'httpcode'
 * is set to 505 and the cURL error message is echoed.
 *
 * @param string $url URL of the page to fetch, e.g. 'http://www.ttphp.com'.
 * @return array Associative array with the keys:
 *               - content_type: Content-Type reported by cURL
 *               - charset:      charset from the Content-Type header, else from a <meta> tag
 *               - title:        trimmed contents of <title>
 *               - description:  content of <meta name="description">, double quotes removed
 *               - keywords:     content of <meta name="keywords">, double quotes removed
 *               - body:         <body> text without tags, passed through addslashes()
 *               - httpcode:     HTTP status code (505 on a cURL error)
 *               - all:          whole page without tags, passed through addslashes()
 */
function getPageContent($url) {

        $pageinfo = array(
            'content_type' => '',
            'charset'      => '',
            'title'        => '',
            'description'  => '',
            'keywords'     => '',
            'body'         => '',
            'httpcode'     => 200,
            'all'          => '',
        );

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)");
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        // NOTE(review): certificate verification is switched off, so HTTPS
        // responses are not authenticated. Acceptable for a best-effort
        // scraper; do not reuse these options where the content is trusted.
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0);
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, 0);
        curl_setopt($ch, CURLOPT_TIMEOUT, 8);
        curl_setopt($ch, CURLOPT_FILETIME, 1);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($ch, CURLOPT_URL, $url);

        $store = curl_exec($ch);

        // Read everything needed from the handle up front so that it can be
        // closed exactly once, whichever return path is taken below.
        $curlError   = curl_error($ch);
        $httpCode    = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $contentType = (string) curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
        curl_close($ch);

        if ($store === false || $curlError !== '') {
            $pageinfo['httpcode'] = 505;  // reported to callers as a gateway-style error
            echo 'Curl error: ' . $curlError . "\n";
            return $pageinfo;
        }

        $pageinfo['httpcode']     = $httpCode;
        $pageinfo['content_type'] = $contentType;
        if (intval($httpCode) !== 200 || !preg_match('@text/html@', $contentType)) {
            return $pageinfo;
        }

        // Take the charset from the Content-Type header when it is present.
        if (preg_match('/charset=([^\s;]+)/i', $contentType, $matches)) {
            $pageinfo['charset'] = trim($matches[1], " \t\n\r\0\x0B\"'");
        }

        // Drop <script> and <style> elements before looking at the markup.
        // preg_replace() returns null on a PCRE failure (e.g. backtrack limit);
        // in that case keep the unstripped page rather than losing it.
        $stripped = preg_replace(
            array(
                '/<script\b[^>]*>.*<\/script>/smUi',
                '/<style\b[^>]*>.*<\/style>/smUi',
            ),
            '',
            $store
        );
        if ($stripped !== null) {
            $store = $stripped;
        }

        // Remove the full-width (ideographic, U+3000) space. The UTF-8 byte
        // sequence is matched literally so that ordinary ASCII spaces are kept
        // and pages that are not valid UTF-8 cannot trigger a PCRE error.
        $store = str_replace("\xE3\x80\x80", '', $store);

        // Fall back to the charset declared in a <meta> tag.
        if ($pageinfo['charset'] === ''
            && preg_match('@<meta[^>]+charset\s*=\s*["\']?([\w\-]+)@i', $store, $matches)) {
            $pageinfo['charset'] = trim($matches[1]);
        }

        // <meta name="description" content="..."> and <meta name="keywords" content="...">
        foreach (array('description', 'keywords') as $metaName) {
            $pattern = '@<meta\s+name=["\']?' . $metaName . '["\']?\s+content\s*=\s*([^/>]+)/*>@i';
            if (preg_match($pattern, $store, $matches)) {
                $pageinfo[$metaName] = str_replace('"', '', trim($matches[1]));
            }
        }

        if (preg_match('/<title[^>]*>(.*)<\/title>/smUi', $store, $matches)) {
            $pageinfo['title'] = trim($matches[1]);
        }

        if (preg_match('/<body[^>]*>(.*)<\/body>/smUi', $store, $matches)) {
            $pageinfo['body'] = addslashes(replaceHtmlAndJs($matches[1]));
        }
        $pageinfo['all'] = addslashes(replaceHtmlAndJs($store));

        return $pageinfo;

}

/**
 * Remove all HTML tags and JavaScript from a fragment of markup.
 *
 * @param string $document HTML fragment.
 * @return string The trimmed fragment with <script> elements, every remaining
 *                tag and every named HTML entity (&name;) removed. An empty
 *                string is returned if the regular-expression engine fails.
 */
function replaceHtmlAndJs($document)
{
    $document = trim($document);
    if (strlen($document) <= 0) {
        return $document;
    }

    $search = array(
        '#<script[^>]*?>.*?</script>#si', // remove JavaScript
        '#<[/!]*?[^<>]*?>#si',            // remove HTML tags
        '#&(\w+);#i',                     // remove named HTML entities
    );

    $result = preg_replace($search, '', $document);

    return $result === null ? '' : $result;
}

// Usage example:
//
//     $a = getPageContent('http://www.ttphp.com');
//     print_r($a);

/* Remainder of this line of the scraped page (site markup, not part of the tutorial code), kept verbatim:
<div class="codetitle"><span style="CURSOR: pointer" onclick="doCopy('code14066')"> <u> </u></span></div>http://www.bkjia.com/PHPjc/728081.html<div class="code" id="code14066"> <br>www.bkjia.com<br><br>true</div>http: //www.bkjia.com/PHPjc/728081.html<p align="left">TechArticle</p> <div style="display:none;"> <span id="url" itemprop="url">Copy the code The code is as follows: function getPageContent($url) { //$url='http://www.
*/
ttphp.com; $pageinfo = array(); $pageinfo[content_type] = ''; $pageinfo[charset] = ''; $pageinfo[title...</span><span id="indexUrl" itemprop="indexUrl"></span><span id="isOriginal" itemprop="isOriginal"> </span><span id="isBasedOnUrl" itemprop="isBasedOnUrl"></span> </div></div><div class="nphpQianMsg"><div class="clear"></div></div><div class="nphpQianSheng"><span>Statement:</span><div>The content of this article is voluntarily contributed by netizens, and the copyright belongs to the original author. This site does not assume corresponding legal responsibility. If you find any content suspected of plagiarism or infringement, please contact admin@php.cn</div></div></div><div class="nphpSytBox"><span>Previous article:<a class="dBlack" title="Get the dates of last month, next month, and this month (strtotime, date) under PHP_PHP tutorial" href="https://m.php.cn/faq/296961.html">Get the dates of last month, next month, and this month (strtotime, date) under PHP_PHP tutorial</a></span><span>Next article:<a class="dBlack" title="Get the dates of last month, next month, and this month (strtotime, date) under PHP_PHP tutorial" href="https://m.php.cn/faq/296963.html">Get the dates of last month, next month, and this month (strtotime, date) under PHP_PHP tutorial</a></span></div><div class="nphpSytBox2"><div class="nphpZbktTitle"><h2>Related articles</h2><em><a href="https://m.php.cn/article.html" class="bBlack"><i>See more</i><b></b></a></em><div class="clear"></div></div><ins class="adsbygoogle" style="display:block" data-ad-format="fluid" data-ad-layout-key="-6t+ed+2i-1n-4w" data-ad-client="ca-pub-5902227090019525" data-ad-slot="8966999616"></ins><script> (adsbygoogle = window.adsbygoogle || []).push({}); </script><ul class="nphpXgwzList"><li><b></b><a href="https://m.php.cn/faq/1.html" title="How to use cURL to implement Get and Post requests in PHP" class="aBlack">How to use cURL to implement Get and Post requests in PHP</a><div class="clear"></div></li><li><b></b><a 
href="https://m.php.cn/faq/1.html" title="How to use cURL to implement Get and Post requests in PHP" class="aBlack">How to use cURL to implement Get and Post requests in PHP</a><div class="clear"></div></li><li><b></b><a href="https://m.php.cn/faq/1.html" title="How to use cURL to implement Get and Post requests in PHP" class="aBlack">How to use cURL to implement Get and Post requests in PHP</a><div class="clear"></div></li><li><b></b><a href="https://m.php.cn/faq/1.html" title="How to use cURL to implement Get and Post requests in PHP" class="aBlack">How to use cURL to implement Get and Post requests in PHP</a><div class="clear"></div></li><li><b></b><a href="https://m.php.cn/faq/2.html" title="All expression symbols in regular expressions (summary)" class="aBlack">All expression symbols in regular expressions (summary)</a><div class="clear"></div></li></ul></div></div><ins class="adsbygoogle" style="display:block" data-ad-format="autorelaxed" data-ad-client="ca-pub-5902227090019525" data-ad-slot="5027754603"></ins><script> (adsbygoogle = window.adsbygoogle || []).push({}); </script><footer><div class="footer"><div class="footertop"><img src="/static/imghwm/logo.png" alt=""><p>Public welfare online PHP training,Help PHP learners grow quickly!</p></div><div class="footermid"><a href="https://m.php.cn/about/us.html">About us</a><a href="https://m.php.cn/about/disclaimer.html">Disclaimer</a><a href="https://m.php.cn/update/article_0_1.html">Sitemap</a></div><div class="footerbottom"><p> © php.cn All rights reserved </p></div></div></footer><script>isLogin = 0;</script><script type="text/javascript" src="/static/layui/layui.js"></script><script type="text/javascript" src="/static/js/global.js?4.9.47"></script></div><script src="https://vdse.bdstatic.com//search-video.v1.min.js"></script><link rel='stylesheet' id='_main-css' href='/static/css/viewer.min.css' type='text/css' media='all'/><script type='text/javascript' src='/static/js/viewer.min.js?1'></script><script 
type='text/javascript' src='/static/js/jquery-viewer.min.js'></script><script>jQuery.fn.wait = function (func, times, interval) { var _times = times || -1, //100次 _interval = interval || 20, //20毫秒每次 _self = this, _selector = this.selector, //选择器 _iIntervalID; //定时器id if( this.length ){ //如果已经获取到了,就直接执行函数 func && func.call(this); } else { _iIntervalID = setInterval(function() { if(!_times) { //是0就退出 clearInterval(_iIntervalID); } _times <= 0 || _times--; //如果是正数就 -- _self = $(_selector); //再次选择 if( _self.length ) { //判断是否取到 func && func.call(_self); clearInterval(_iIntervalID); } }, _interval); } return this; } $("table.syntaxhighlighter").wait(function() { $('table.syntaxhighlighter').append("<p class='cnblogs_code_footer'><span class='cnblogs_code_footer_icon'></span></p>"); }); $(document).on("click", ".cnblogs_code_footer",function(){ $(this).parents('table.syntaxhighlighter').css('display','inline-table');$(this).hide(); }); $('.nphpQianCont').viewer({navbar:true,title:false,toolbar:false,movable:false,viewed:function(){$('img').click(function(){$('.viewer-close').trigger('click');});}}); </script></body><!-- Matomo --><script> var _paq = window._paq = window._paq || []; /* tracker methods like "setCustomDimension" should be called before "trackPageView" */ _paq.push(['trackPageView']); _paq.push(['enableLinkTracking']); (function() { var u="https://tongji.php.cn/"; _paq.push(['setTrackerUrl', u+'matomo.php']); _paq.push(['setSiteId', '9']); var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0]; g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s); })(); </script><!-- End Matomo Code --></html>