Home > Backend Development > PHP Tutorial > 基于Snoopy的PHP近似完美获取网站编码的代码_php实例

基于Snoopy的PHP近似完美获取网站编码的代码_php实例

WBOY | Original | 2016-05-17 09:15:02 | 1141 browse

先要到网上下载Snoopy.class.php
调用方法：

<?php
// Usage example: print the detected character encoding of a page.
// require_once guards against a fatal "cannot redeclare class" error if either
// file has already been loaded elsewhere.
require_once 'lib/Snoopy.class.php';
require_once 'lib/WebCrawl.class.php'; // contains the WebCrawl class listed below

$go = new WebCrawl('http://www.baidu.com');

// getCharset() returns a lower-case charset name, or false when the page
// could not be fetched or no encoding could be determined (echo prints nothing then).
echo $go->getCharset();

代码如下：

<?php
/**
 * Fetches a web page through Snoopy and works out its character encoding.
 *
 * The encoding is looked up, in order, in the page's <meta> charset declaration,
 * in the HTTP response headers, and finally by byte-level detection with mbstring.
 * After a successful fetch the page body is lower-cased and, when the detected
 * encoding is not UTF-8, converted to UTF-8.
 *
 * Requires the Snoopy class (Snoopy.class.php) to be loaded beforehand.
 */
class WebCrawl
{
    /** @var string URL of the page to inspect. */
    private $url;

    /** @var Snoopy|null HTTP client; created on the first open() and reused afterwards. */
    private $request;

    /** @var string[] Lower-case charset names accepted from a <meta> declaration. */
    public $charset_arr = array(
        'gb2312', 'utf-8', 'big5', 'gbk', 'ascii', 'cp936',
        'ibm037', 'ibm437', 'ibm500', 'asmo-708', 'dos-720', 'ibm737', 'ibm775',
        'ibm850', 'ibm852', 'ibm855', 'ibm857', 'ibm00858', 'ibm861', 'ibm860',
        'dos-862', 'ibm863', 'ibm864', 'ibm865', 'cp866', 'ibm869', 'ibm870',
        'windows-874', 'cp875', 'shift_jis', 'ks_c_5601-1987', 'ibm1026',
        'ibm01047', 'ibm01040', 'ibm01041', 'ibm01042', 'ibm01043', 'ibm01044',
        'ibm01045', 'ibm01046', 'ibm01048', 'ibm01049',
        'utf-16', 'unicodefffe',
        'windows-1250', 'windows-1251', 'windows-1252', 'windows-1253', 'windows-1254',
        'windows-1255', 'windows-1256', 'windows-1257', 'windows-1258',
        'johab', 'macintosh',
        'x-mac-japanese', 'x-mac-chinesetrad', 'x-mac-korean', 'x-mac-arabic',
        'x-mac-hebrew', 'x-mac-greek', 'x-mac-cyrillic', 'x-mac-chinesesimp',
        'x-mac-romanian', 'x-mac-ukrainian', 'x-mac-thai', 'x-mac-ce',
        'x-mac-icelandic', 'x-mac-turkish', 'x-mac-croatian',
        'x-chinese-cns', 'x-cp20001', 'x-chinese-eten', 'x-cp20003', 'x-cp20004', 'x-cp20005',
        'x-ia5', 'x-ia5-german', 'x-ia5-swedish', 'x-ia5-norwegian',
        'us-ascii', 'x-cp20261', 'x-cp20269',
        'ibm273', 'ibm277', 'ibm278', 'ibm280', 'ibm284', 'ibm285', 'ibm290',
        'ibm420', 'ibm423', 'ibm424',
        'x-ebcdic-koreanextended', 'ibm-thai', 'koi8-r',
        'ibm871', 'ibm880', 'ibm905', 'ibm00924',
        'x-cp20936', 'x-cp20949', 'cp1025', 'koi8-u',
        'iso-8859-1', 'iso-8859-2', 'iso-8859-3', 'iso-8859-4', 'iso-8859-5',
        'iso-8859-6', 'iso-8859-7', 'iso-8859-8', 'iso-8859-9', 'iso-8859-13',
        'iso-8859-15', 'x-europa', 'iso-8859-8-i',
        'iso-2022-jp', 'csiso2022jp', 'iso-2022-kr', 'x-cp50227',
        'euc-jp', 'euc-cn', 'euc-kr', 'hz-gb-2312', 'gb18030',
        'x-iscii-de', 'x-iscii-be', 'x-iscii-ta', 'x-iscii-te', 'x-iscii-as',
        'x-iscii-or', 'x-iscii-ka', 'x-iscii-ma', 'x-iscii-gu', 'x-iscii-pa',
        'utf-7', 'utf-32', 'utf-32be'
    );

    /**
     * @param string $url Absolute URL of the page to inspect.
     */
    public function __construct($url)
    {
        $this->url = $url;
    }

    /**
     * Fetches the page once and normalises its body to lower-case UTF-8.
     *
     * Later calls do not fetch again; they only report whether the first
     * fetch ended with HTTP status 200.
     *
     * @param string $url URL to fetch.
     * @return bool True when the response status is 200.
     */
    private function open($url)
    {
        if ($this->request !== null) {
            return (int) $this->request->status === 200;
        }

        // Assign the client before calling getCharset() below: getCharset() calls
        // open() again and must hit the cached branch above instead of recursing.
        $this->request = new Snoopy();
        $this->request->fetch($url);
        if ((int) $this->request->status !== 200) {
            return false;
        }

        // The body is lower-cased so that tag and attribute matching is case-insensitive.
        // NOTE(review): this also lower-cases visible text such as the title.
        $this->request->results = strtolower($this->request->results);

        $charset = $this->getCharset();
        if ($charset === false || $charset === 'utf-8') {
            // Nothing to convert, or nothing known to convert from.
            return true;
        }

        if ($charset === 'windows-1252') {
            // Such pages are treated as carrying their non-Latin text as numeric entities.
            $this->request->results = $this->uni_decode($this->request->results);
            return true;
        }

        // mbstring rejects charset names it does not know: PHP 8 throws ValueError,
        // older versions return false. Either way keep the original bytes rather
        // than replacing the page with false.
        try {
            $converted = mb_convert_encoding($this->request->results, 'UTF-8', $charset);
        } catch (\ValueError $e) {
            $converted = false;
        }
        if (is_string($converted)) {
            $this->request->results = $converted;
        }

        return true;
    }

    /**
     * Collects the page's title, keywords, description and the host's IPv4 address.
     *
     * @return array Keys 'title', 'keywords', 'desc' and 'ip'; a value that could
     *               not be determined is an empty string.
     */
    public function getWebinfo()
    {
        $info = array(
            'title' => '',
            'keywords' => '',
            'desc' => '',
            'ip' => ''
        );
        if (!$this->open($this->url)) {
            return $info;
        }

        $html = $this->request->results;

        if (preg_match('/<title>([^>]*)<\/title>/si', $html, $titleMatch)) {
            $info['title'] = strip_tags($titleMatch[1]);
        }

        $meta = $this->extractMetaValues($html);
        if (isset($meta['keywords'])) {
            $info['keywords'] = $meta['keywords'];
        }
        if (isset($meta['description'])) {
            $info['desc'] = $meta['description'];
        }

        // Prefer the parsed host so that https URLs, ports and paths work; fall back
        // to stripping the scheme for inputs parse_url() cannot split.
        $host = parse_url($this->url, PHP_URL_HOST);
        if (!is_string($host) || $host === '') {
            $host = preg_replace('/http\:\/\//si', '', $this->url);
        }

        // gethostbyname() returns its argument unchanged when resolution fails, so the
        // result must be validated as an address before it is reported. The length
        // check avoids the warning PHP raises for over-long host names.
        if ($host !== '' && strlen($host) <= 255) {
            $ip = gethostbyname($host);
            if (filter_var($ip, FILTER_VALIDATE_IP) !== false) {
                $info['ip'] = $ip;
            }
        }

        return $info;
    }

    /**
     * Maps meta tag names to their content values.
     *
     * Both attribute orders (name before content, content before name) are
     * recognised. When a name occurs in both forms the name-first tag wins.
     *
     * @param string $html Page markup.
     * @return array Meta content keyed by meta name.
     */
    private function extractMetaValues($html)
    {
        $nameFirst = '/<meta[\s]+name="?([^>"]*)"?[\s]*' . 'content="?([^>"]*)"?[\s]*[\/]?[\s]*>/si';
        $contentFirst = '/<meta[\s]+content="?([^>"]*)"?[\s]*name="?' . '([^>"]*)"?[\s]*[\/]?[\s]*>/si';

        $values = array();

        if (preg_match_all($contentFirst, $html, $tags, PREG_SET_ORDER)) {
            foreach ($tags as $tag) {
                $values[$tag[2]] = $tag[1];
            }
        }

        // Processed last so that name-first tags override content-first ones.
        if (preg_match_all($nameFirst, $html, $tags, PREG_SET_ORDER)) {
            foreach ($tags as $tag) {
                $values[$tag[1]] = $tag[2];
            }
        }

        return $values;
    }

    /**
     * Guesses from the first multi-byte sequence whether text is UTF-8 or GB2312.
     *
     * A lead byte with its top three bits set followed by two bytes with the high
     * bit set counts as UTF-8; a lead byte with its top two bits set followed by
     * one byte with the high bit set counts as GB2312.
     * NOTE(review): this is a heuristic; two-byte UTF-8 sequences are reported as
     * GB2312.
     *
     * @param string $string Raw bytes to inspect.
     * @param mixed  $o      Unused; kept for backward compatibility with callers.
     * @return string 'utf-8', 'gb2312', or '' when no multi-byte sequence matched.
     */
    public function t($string, $o)
    {
        $encoding = '';
        $length = strlen($string);

        for ($i = 0; $i < $length; $i++) {
            if ($this->byteAt($string, $i) < 128) {
                continue;
            }

            if (($this->byteAt($string, $i) & 224) === 224) {
                // First byte passed; check the second byte.
                if (($this->byteAt($string, ++$i) & 128) === 128) {
                    // Second byte passed; check the third byte.
                    if (($this->byteAt($string, ++$i) & 128) === 128) {
                        $encoding = 'UTF-8';
                        break;
                    }
                }
            }

            // $i may have been advanced above; the byte now at $i is tested as a
            // possible two-byte lead.
            if (($this->byteAt($string, $i) & 192) === 192) {
                // First byte passed; check the second byte.
                if (($this->byteAt($string, ++$i) & 128) === 128) {
                    $encoding = 'GB2312';
                    break;
                }
            }
        }

        return strtolower($encoding);
    }

    /**
     * Returns the byte value at an offset, or 0 when the offset is past the end.
     *
     * @param string $string Raw bytes.
     * @param int    $index  Zero-based byte offset.
     * @return int Byte value 0-255.
     */
    private function byteAt($string, $index)
    {
        return isset($string[$index]) ? ord($string[$index]) : 0;
    }

    /**
     * Replaces five-digit decimal entities (&#NNNNN;) with the characters they name.
     *
     * The text is decoded in place rather than round-tripped through JSON, so
     * quotes, backslashes and line breaks in the input are left untouched.
     *
     * @param string $str  Text containing numeric entities.
     * @param string $code Target encoding; any value other than 'utf-8' is passed
     *                     to iconv() as the output charset.
     * @return string|false Decoded text; false when iconv() fails.
     */
    public function uni_decode($str, $code = 'utf-8')
    {
        $decoded = preg_replace_callback(
            '/&#(\d{5});/',
            function ($match) {
                return html_entity_decode($match[0], ENT_NOQUOTES, 'UTF-8');
            },
            $str
        );
        if ($decoded === null) {
            // PCRE failure: fall back to the untouched input.
            $decoded = $str;
        }

        if ($code != 'utf-8') {
            $decoded = iconv('utf-8', $code, $decoded);
        }

        return $decoded;
    }

    /**
     * Determines the character encoding of the page.
     *
     * @return string|false Lower-case charset name, or false when the page could
     *                      not be fetched or no encoding could be determined.
     */
    public function getCharset()
    {
        if (!$this->open($this->url)) {
            return false;
        }

        $html = $this->request->results;

        // First try the charset declared in the HTML.
        if (preg_match("/<meta.+?charset=[^\w]?([-\w]+)/i", $html, $declaredMatch)) {
            $declared = strtolower($declaredMatch[1]);
            if (in_array($declared, $this->charset_arr, true)) {
                // A gb2312 declaration is only accepted when the byte heuristic
                // agrees; otherwise fall through to the headers and detection.
                if ($declared !== 'gb2312' || $this->t($html, $declared) === $declared) {
                    return $declared;
                }
            }
        }

        // Then try the charset from the response headers.
        if (!empty($this->request->headers)) {
            $hstr = strtolower(implode("|||", $this->request->headers));
            if (preg_match("/charset=[^\w]?([-\w]+)/is", $hstr, $lang) && $lang[1] !== '') {
                return $lang[1];
            }
        }

        // Finally let mbstring guess from the bytes.
        $encode_arr = array("UTF-8", "GB2312", "GBK", "BIG5", "ASCII", "EUC-JP", "Shift_JIS", "CP936", "ISO-8859-1", "JIS", "eucjp-win", "sjis-win");
        $encoded = mb_detect_encoding($html, $encode_arr);

        return $encoded ? strtolower($encoded) : false;
    }
}

Statement：

The content of this article is voluntarily contributed by netizens, and the copyright belongs to the original author. This site does not assume corresponding legal responsibility. If you find any content suspected of plagiarism or infringement, please contact admin@php.cn

Previous article：php数组函数序列之array_sum() - 计算数组元素值之和_php技巧

Next article：php数组函数序列之array_key_exists() - 查找数组键名是否存在_php技巧

See more

基于Snoopy的PHP近似完美获取网站编码的代码_php实例

Related articles