Heim >Backend-Entwicklung >PHP-Tutorial >基于Snoopy的PHP近似完美获取网站编码的代码_php实例

基于Snoopy的PHP近似完美获取网站编码的代码_php实例

WBOY Original: 2016-05-17 09:15:02 1151 Durchsuche

先要到网上下载Snoopy.class.php
调用方法：

 
// Usage example: load Snoopy and the WebCrawl class, then print the
// character encoding detected for a page.
require 'lib/Snoopy.class.php';
require 'lib/WebCrawl.class.php'; // contains the WebCrawl class listed below

$go = new WebCrawl('http://www.baidu.com');
$charset = $go->getCharset();

// getCharset() returns false when the page cannot be fetched or no encoding
// can be determined; echoing false would silently print an empty string.
echo $charset === false ? 'unknown' : $charset;

复制代码代码如下:

 
/**
 * Fetches a page with Snoopy, detects its character encoding and exposes
 * basic page information (title, keywords, description, IP address).
 *
 * The page is fetched at most once per instance. After a successful fetch the
 * body is converted to UTF-8 where possible and its ASCII letters are
 * lower-cased, so later pattern matching is case-insensitive.
 */
class WebCrawl
{
    /** @var string URL passed to the constructor. */
    private $url;

    /** @var Snoopy|null HTTP client holding the response; null until the first fetch. */
    private $request;

    /** @var string|false|null Encoding detected during the fetch; false if none was found, null before the fetch. */
    private $charset;

    /**
     * Charset names accepted when they are declared in a <meta> tag.
     *
     * NOTE(review): the list contains duplicates ('ibm01047', 'iso-2022-jp')
     * and 'ibm01040'..'ibm01049' look like typos for 'ibm01140'..'ibm01149'.
     * It is left untouched because the property is public - confirm before
     * cleaning it up.
     *
     * @var string[]
     */
    public $charset_arr = array(
        'gb2312', 'utf-8', 'big5', 'gbk', 'ascii', 'cp936',
        'ibm037', 'ibm437', 'ibm500', 'asmo-708', 'dos-720',
        'ibm737', 'ibm775', 'ibm850', 'ibm852', 'ibm855', 'ibm857', 'ibm00858',
        'ibm861', 'ibm860', 'dos-862', 'ibm863', 'ibm864', 'ibm865', 'cp866',
        'ibm869', 'ibm870', 'windows-874', 'cp875', 'shift_jis', 'ks_c_5601-1987',
        'ibm1026', 'ibm01047', 'ibm01047',
        'ibm01040', 'ibm01041', 'ibm01042', 'ibm01043', 'ibm01044',
        'ibm01045', 'ibm01046', 'ibm01047', 'ibm01048', 'ibm01049',
        'utf-16', 'unicodefffe',
        'windows-1250', 'windows-1251', 'windows-1252', 'windows-1253', 'windows-1254',
        'windows-1255', 'windows-1256', 'windows-1257', 'windows-1258',
        'johab', 'macintosh',
        'x-mac-japanese', 'x-mac-chinesetrad', 'x-mac-korean', 'x-mac-arabic',
        'x-mac-hebrew', 'x-mac-greek', 'x-mac-cyrillic', 'x-mac-chinesesimp',
        'x-mac-romanian', 'x-mac-ukrainian', 'x-mac-thai', 'x-mac-ce',
        'x-mac-icelandic', 'x-mac-turkish', 'x-mac-croatian',
        'x-chinese-cns', 'x-cp20001', 'x-chinese-eten',
        'x-cp20003', 'x-cp20004', 'x-cp20005',
        'x-ia5', 'x-ia5-german', 'x-ia5-swedish', 'x-ia5-norwegian',
        'us-ascii', 'x-cp20261', 'x-cp20269',
        'ibm273', 'ibm277', 'ibm278', 'ibm280', 'ibm284', 'ibm285', 'ibm290',
        'ibm420', 'ibm423', 'ibm424',
        'x-ebcdic-koreanextended', 'ibm-thai', 'koi8-r',
        'ibm871', 'ibm880', 'ibm905', 'ibm00924',
        'x-cp20936', 'x-cp20949', 'cp1025', 'koi8-u',
        'iso-8859-1', 'iso-8859-2', 'iso-8859-3', 'iso-8859-4', 'iso-8859-5',
        'iso-8859-6', 'iso-8859-7', 'iso-8859-8', 'iso-8859-9',
        'iso-8859-13', 'iso-8859-15',
        'x-europa', 'iso-8859-8-i',
        'iso-2022-jp', 'csiso2022jp', 'iso-2022-jp', 'iso-2022-kr',
        'x-cp50227', 'euc-jp', 'euc-cn', 'euc-kr', 'hz-gb-2312', 'gb18030',
        'x-iscii-de', 'x-iscii-be', 'x-iscii-ta', 'x-iscii-te', 'x-iscii-as',
        'x-iscii-or', 'x-iscii-ka', 'x-iscii-ma', 'x-iscii-gu', 'x-iscii-pa',
        'utf-7', 'utf-32', 'utf-32be'
    );

    /**
     * @param string $url URL of the page to inspect.
     */
    public function __construct($url)
    {
        $this->url = $url;
    }

    /**
     * Fetches the page on first use and normalises its body.
     *
     * Later calls do not fetch again; they only report whether the stored
     * response has status 200.
     *
     * @param string $url URL to fetch.
     * @return bool True when the response status is 200.
     */
    private function open($url)
    {
        if ($this->request !== null) {
            return $this->request->status == 200;
        }

        $this->request = new Snoopy();
        $this->request->fetch($url);
        if ($this->request->status != 200) {
            return false;
        }

        // Detect on the untouched bytes and remember the answer: once the body
        // has been converted to UTF-8 a second detection would report the
        // converted encoding instead of the one the site actually uses.
        $this->charset = $this->detectCharset();

        $body = $this->request->results;
        if ($this->charset !== false && $this->charset != 'utf-8') {
            if ($this->charset == 'windows-1252') {
                $body = $this->uni_decode($body);
            } else {
                $body = $this->convertToUtf8($body, $this->charset);
            }
        }

        // Lower-case only after conversion: in GBK/Big5 the second byte of a
        // character may fall in the A-Z range, so lower-casing the raw bytes
        // would corrupt such characters.
        $this->request->results = $this->lowerAscii($body);

        return true;
    }

    /**
     * Collects title, meta keywords, meta description and the host's IPv4 address.
     *
     * @return array Keys 'title', 'keywords', 'desc' and 'ip'; each value is an
     *               empty string when the item is unavailable.
     */
    public function getWebinfo()
    {
        $info = array(
            'title' => '',
            'keywords' => '',
            'desc' => '',
            'ip' => ''
        );
        if (!$this->open($this->url)) {
            return $info;
        }

        $html = $this->request->results;

        if (preg_match('/<title\b[^>]*>(.*?)<\/title>/si', $html, $titleMatch)) {
            $info['title'] = strip_tags($titleMatch[1]);
        }

        $metaTags = $this->extractMetaTags($html);
        if (isset($metaTags['keywords'])) {
            $info['keywords'] = $metaTags['keywords'];
        }
        if (isset($metaTags['description'])) {
            $info['desc'] = $metaTags['description'];
        }

        $info['ip'] = $this->resolveIp();

        return $info;
    }

    /**
     * Byte-pattern heuristic that tells UTF-8 text from GB2312 text.
     *
     * ASCII bytes are skipped. A byte with its top three bits set followed by
     * two bytes with the high bit set is taken as UTF-8; a byte with its top
     * two bits set followed by one byte with the high bit set is taken as
     * GB2312. Scanning stops at the first verdict.
     *
     * @param string $string Raw bytes to inspect.
     * @param mixed  $o      Unused; kept for backward compatibility.
     * @return string 'utf-8', 'gb2312', or '' when no pattern was found.
     */
    public function t($string, $o)
    {
        $encoding = '';
        $length = strlen($string);

        for ($i = 0; $i < $length; $i++) {
            if (ord($string[$i]) < 128) {
                continue;
            }

            if ((ord($string[$i]) & 224) == 224) {
                // First byte check passed.
                if (($this->byteAt($string, ++$i) & 128) == 128) {
                    // Second byte check passed.
                    if (($this->byteAt($string, ++$i) & 128) == 128) {
                        $encoding = 'UTF-8';
                        break;
                    }
                }
            }

            // $i may have advanced above; the byte now under the cursor is
            // tested as a possible double-byte lead.
            if (($this->byteAt($string, $i) & 192) == 192) {
                // First byte check passed.
                if (($this->byteAt($string, ++$i) & 128) == 128) {
                    // Second byte check passed.
                    $encoding = 'GB2312';
                    break;
                }
            }
        }

        return strtolower($encoding);
    }

    /**
     * Replaces five-digit decimal character references (e.g. "&#20013;") with
     * the characters they denote.
     *
     * Only the references are touched; the rest of the text - quotes,
     * backslashes and line breaks included - is passed through unchanged.
     *
     * @param string $str  Text containing numeric character references.
     * @param string $code Target encoding; anything other than 'utf-8' is
     *                     produced with iconv() from the UTF-8 result.
     * @return string|false Decoded text, or false if iconv() fails.
     */
    public function uni_decode($str, $code = 'utf-8')
    {
        $str = preg_replace_callback(
            '/&#\d{5};/',
            function ($reference) {
                return html_entity_decode($reference[0], ENT_QUOTES, 'UTF-8');
            },
            $str
        );

        if ($code != 'utf-8') {
            $str = iconv('utf-8', $code, $str);
        }

        return $str;
    }

    /**
     * Returns the encoding detected for the page, fetching it first if needed.
     *
     * @return string|false Lower-case charset name, or false when the page
     *                      could not be fetched or no encoding was determined.
     */
    public function getCharset()
    {
        if (!$this->open($this->url)) {
            return false;
        }

        return $this->charset;
    }

    /**
     * Determines the encoding of the fetched response.
     *
     * Sources are tried in order: the <meta> charset declaration, the charset
     * in the response headers, then mb_detect_encoding() on the body.
     *
     * @return string|false Lower-case charset name, or false if undetermined.
     */
    private function detectCharset()
    {
        $html = $this->request->results;

        // First look for the encoding declared in the HTML itself.
        if (preg_match('/<meta[^>]*?charset=[^\w]?([-\w]+)/i', $html, $found)) {
            $declared = strtolower($found[1]);
            if (in_array($declared, $this->charset_arr, true)) {
                // A gb2312 declaration is only trusted when the byte
                // heuristic agrees; otherwise fall through to the headers.
                if ($declared != 'gb2312' || $this->t($html, $declared) == $declared) {
                    return $declared;
                }
            }
        }

        // Then look for the encoding in the response headers.
        if (!empty($this->request->headers)) {
            $headerText = strtolower(implode("|||", $this->request->headers));
            if (preg_match("/charset=[^\w]?([-\w]+)/is", $headerText, $found)) {
                return $found[1];
            }
        }

        $encode_arr = array("UTF-8", "GB2312", "GBK", "BIG5", "ASCII", "EUC-JP", "Shift_JIS", "CP936", "ISO-8859-1", "JIS", "eucjp-win", "sjis-win");
        $encoded = mb_detect_encoding($html, $encode_arr);

        return $encoded ? strtolower($encoded) : false;
    }

    /**
     * Converts text to UTF-8, returning it unchanged when mbstring cannot
     * handle the source encoding.
     *
     * @param string $text    Text to convert.
     * @param string $charset Source encoding name.
     * @return string
     */
    private function convertToUtf8($text, $charset)
    {
        try {
            $converted = mb_convert_encoding($text, 'UTF-8', $charset);
        } catch (\ValueError $e) {
            // PHP 8 throws for encoding names mbstring does not know.
            return $text;
        }

        // Older PHP versions return false (with a warning) instead of throwing.
        return is_string($converted) ? $converted : $text;
    }

    /**
     * Lower-cases the ASCII letters A-Z and leaves every other byte alone,
     * independent of the current locale.
     *
     * @param string $text
     * @return string
     */
    private function lowerAscii($text)
    {
        return strtr($text, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz');
    }

    /**
     * Returns the byte value at the given offset, or 0 past the end of the string.
     *
     * @param string $string
     * @param int    $index
     * @return int
     */
    private function byteAt($string, $index)
    {
        return isset($string[$index]) ? ord($string[$index]) : 0;
    }

    /**
     * Builds a name => content map from the <meta> tags of a page.
     *
     * Both attribute orders are recognised. When the same name occurs in both
     * orders, the name-first tag wins.
     *
     * @param string $html Page markup.
     * @return array Map of meta name to content value.
     */
    private function extractMetaTags($html)
    {
        $tags = array();

        $contentFirst = '/<[\s]*meta[\s]*content="?([^>"]*)"?[\s]*name="?([^>"]*)"?[\s]*[\/]?[\s]*>/si';
        if (preg_match_all($contentFirst, $html, $hits, PREG_SET_ORDER)) {
            foreach ($hits as $hit) {
                $tags[$hit[2]] = $hit[1];
            }
        }

        // Applied second so that name-first tags override content-first ones.
        $nameFirst = '/<[\s]*meta[\s]*name="?([^>"]*)"?[\s]*content="?([^>"]*)"?[\s]*[\/]?[\s]*>/si';
        if (preg_match_all($nameFirst, $html, $hits, PREG_SET_ORDER)) {
            foreach ($hits as $hit) {
                $tags[$hit[1]] = $hit[2];
            }
        }

        return $tags;
    }

    /**
     * Resolves the host of the configured URL to an IPv4 address.
     *
     * @return string Dotted-quad address, or '' when the host is missing or
     *                does not resolve.
     */
    private function resolveIp()
    {
        $host = parse_url($this->url, PHP_URL_HOST);
        if (!is_string($host) || $host === '') {
            // URL given without a scheme: take everything before the first slash.
            $parts = explode('/', preg_replace('/^https?:\/\//i', '', $this->url), 2);
            $host = $parts[0];
        }
        if ($host === '') {
            return '';
        }

        // gethostbyname() hands back its argument unchanged on failure, so the
        // result must be validated as an address before it is reported.
        $ip = gethostbyname($host);

        return filter_var($ip, FILTER_VALIDATE_IP, FILTER_FLAG_IPV4) !== false ? $ip : '';
    }
}

Stellungnahme：

Der Inhalt dieses Artikels wird freiwillig von Internetnutzern beigesteuert und das Urheberrecht liegt beim ursprünglichen Autor. Diese Website übernimmt keine entsprechende rechtliche Verantwortung. Wenn Sie Inhalte finden, bei denen der Verdacht eines Plagiats oder einer Rechtsverletzung besteht, wenden Sie sich bitte an admin@php.cn

Vorheriger Artikel：php数组函数序列之array_sum() - 计算数组元素值之和_php技巧
Nächster Artikel：php数组函数序列之array_key_exists() - 查找数组键名是否存在_php技巧

In Verbindung stehende Artikel

Mehr sehen