本帖最后由 qq_25293153 于 2015-01-13 15:42:18 编辑

用 file_get_contents 采集一个页面的数据,获取到的数据是乱码。已经使用了检测编码的方式,
检测结果是 utf-8,我的页面编码也是 utf-8,但还是显示乱码,不知道为什么。
// Fetch a remote page and echo it as UTF-8, converting from the detected
// source encoding when the page is not already UTF-8.

$url = "xxx";

// Send a browser-like User-Agent with the request.
$opts = array(
    'http' => array(
        'user_agent' => "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)",
    ),
);
$context = stream_context_create($opts);

$neirong = file_get_contents($url, false, $context);
if ($neirong === false) {
    // file_get_contents() returns false on failure; without this check the
    // script would go on to "detect" and echo an empty/boolean value.
    die("Failed to fetch " . $url);
}

header("content-Type: text/html; charset=Utf-8");

// ob_end_flush() raises a notice when no output buffer is active, so only
// flush when there is one.
if (ob_get_level() > 0) {
    ob_end_flush();
}

// Strict mode (third argument) makes detection reject candidates for which
// the data contains invalid byte sequences, instead of returning the closest
// guess.
$encode = mb_detect_encoding(
    $neirong,
    array("ASCII", "UTF-8", "GB2312", "GBK", "BIG5"),
    true
);

echo $encode . "<br>";

// Convert only when detection succeeded and the source is not already UTF-8.
// A failed detection (false) must not be passed on as a source encoding.
if ($encode !== false && $encode !== "UTF-8") {
    $neirong = mb_convert_encoding($neirong, "UTF-8", $encode);
}

// NOTE(review): if the output is still garbled although the detected encoding
// is UTF-8, the body itself may be transformed by the server (compressed or
// obfuscated) - inspect the raw bytes to confirm.
echo $neirong;
$encode 输出:utf-8
$neirong 输出是乱码
我的页面编码是utf-8
------解决思路----------------------
你在輸出的html中加入 <meta http-equiv="content-type" content="text/html;charset=utf-8">。
它源數據是做了些轉換的,我那個程序已經是轉換過來了。
我把採集的也寫出來,直接運行就可以了。
<?php
// Sample pages this script was written against:
//http://www.ziyouge.com/conbdhekbefiab
//http://www.ziyouge.com/zy/4/4980/1333249.html

// ---------------------------------------------------------------------------
// Fetch
// ---------------------------------------------------------------------------

$url = 'http://www.ziyouge.com/conbdhekbefiab';

$headers = array(
    'User-Agent' => 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
);

// cURL expects each header as a single "Name:value" string.
$headerArr = array();
foreach ($headers as $n => $v) {
    $headerArr[] = $n . ':' . $v;
}

$ch = curl_init();
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_HTTPHEADER, $headerArr); // custom request headers
curl_setopt($ch, CURLOPT_REFERER, 'http://www.ziyouge.com/'); // set the referer
$content = curl_exec($ch);

// Check for a transfer error BEFORE touching the body: curl_exec() returns
// false on failure, and slicing that value would hide the problem.
$error = curl_error($ch);
curl_close($ch);
if ($content === false || $error !== '') {
    die($error);
}

// Drop a leading UTF-8 byte-order mark if present. Its first byte (0xEF)
// would otherwise be treated as the start of an obfuscated character below.
// NOTE(review): the original removed the first 3 bytes unconditionally;
// presumably those were always a BOM - confirm against the live response.
if (strncmp($content, "\xEF\xBB\xBF", 3) === 0) {
    $content = (string) substr($content, 3);
}

// ---------------------------------------------------------------------------
// Decode
// ---------------------------------------------------------------------------

// Walk the body byte by byte. The whole algorithm is defined on bytes
// (lead-byte test, fixed 3-byte step), so use strlen()/substr() rather than
// mb_* functions, whose indexing depends on the mbstring internal encoding.
$result = '';
$length = strlen($content);
$i = 0;
while ($i < $length) {
    $lead = ord($content[$i]);
    // 0xE0..0xEF is the lead byte of a 3-byte UTF-8 sequence; only those are
    // de-obfuscated. Everything else (ASCII, 2-/4-byte sequences, a truncated
    // tail) is copied through unchanged.
    if ($lead >= 0xE0 && $lead <= 0xEF && $i + 2 < $length) {
        $result .= change(substr($content, $i, 3));
        $i += 3;
    } else {
        $result .= $content[$i];
        $i += 1;
    }
}

echo '<meta http-equiv="content-type" content="text/html;charset=utf-8">';
echo $result;

/**
 * De-obfuscate a UTF-8 string whose code units were shifted by the site.
 *
 * Each UCS-2 code unit of $str is passed through decrypt() and the result is
 * re-encoded as UTF-8 via unescape(). Strings listed in $ignore, empty
 * strings and strings that are not valid UTF-8 are returned unchanged.
 *
 * @param string $str UTF-8 text (the main loop passes one 3-byte character)
 * @return string decoded UTF-8 text
 */
function change($str)
{
    // Punctuation that is returned as-is.
    // NOTE(review): the single-byte entries ('!', ':', ',') can never equal a
    // 3-byte sequence; they were presumably full-width characters in the
    // original source - confirm and restore if so.
    $ignore = array('“','”','!','…',':',',',',');

    if ($str === '' || in_array($str, $ignore, true)) {
        return $str;
    }

    // Leave malformed input untouched instead of feeding it to iconv().
    if (!mb_check_encoding($str, 'UTF-8')) {
        return $str;
    }

    // Request big-endian explicitly: plain 'UCS-2' has platform-dependent
    // byte order, which the original compensated for with a manual byte swap
    // that is only correct on little-endian iconv implementations.
    $ucs2 = iconv('UTF-8', 'UCS-2BE', $str);
    if ($ucs2 === false) {
        return $str;
    }

    $unistr = '';
    foreach (str_split($ucs2, 2) as $unit) {
        // bin2hex() of a 2-byte big-endian unit is already the 4-digit code.
        $unistr .= '%u' . decrypt(bin2hex($unit));
    }

    return unescape($unistr);
}

/**
 * Undo the site's shift on one code unit: subtract decimal 100.
 *
 * @param string $d code unit as a hexadecimal string
 * @return string shifted value as lowercase hex, left-padded to 4 digits
 */
function decrypt($d)
{
    // NOTE(review): values below 100 go negative and dechex() then yields a
    // long unsigned representation; the caller only passes 3-byte characters,
    // which are far above that.
    return str_pad(dechex(hexdec($d) - 100), 4, '0', STR_PAD_LEFT);
}

/**
 * Decode JavaScript escape()-style text ("%uXXXX" and "%XX") to UTF-8.
 *
 * @param string $str escaped text
 * @return string UTF-8 text
 */
function unescape($str)
{
    $ret = '';
    $len = strlen($str);
    for ($i = 0; $i < $len; $i++) {
        if ($str[$i] === '%' && isset($str[$i + 1]) && $str[$i + 1] === 'u') {
            $val = hexdec(substr($str, $i + 2, 4));
            if ($val < 0x80) {
                // 7-bit value: one byte (0x7f included, to avoid an overlong
                // two-byte encoding).
                $ret .= chr($val);
            } elseif ($val < 0x800) {
                // Two-byte sequence: 110xxxxx 10xxxxxx
                $ret .= chr(0xc0 | ($val >> 6))
                    . chr(0x80 | ($val & 0x3f));
            } else {
                // Three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
                $ret .= chr(0xe0 | ($val >> 12))
                    . chr(0x80 | (($val >> 6) & 0x3f))
                    . chr(0x80 | ($val & 0x3f));
            }
            $i += 5; // skip "uXXXX"; the loop increment skips the '%'
        } elseif ($str[$i] === '%') {
            $ret .= urldecode(substr($str, $i, 3));
            $i += 2; // skip the two hex digits
        } else {
            $ret .= $str[$i];
        }
    }
    return $ret;
}
?>