Home > Article > Backend Development > 自定义HTTP抓包跟过滤
自定义HTTP抓包和过滤
定义一个http抓包类,发送数据到一个自定义的接受脚本,可以发送成功,并收取数据,但是发送到外网,却不行,分析过在浏览器下发送HTTP请求时的request header 信息,通过模拟请求,但超时...
<br />//定义一个HTTP抓包类,其实也可以用curl。。。。。<br /><br /><?php <br />ini_set('error_reporting', E_ALL);<br />class Httpwrap<br />{<br /> private $hostInfo=null;<br /> <br /> private $requestLine=null;<br /> private $requestHeader=null;<br /> private $emptyLine="\r\n";<br /> private $requestBody=null;<br /> private $requestEntity=null;<br /> <br /> private $responseEntity=null;<br /> private $responseHeader=null;<br /> private $responseBody=null;<br /> private $emptyLinePos=null;<br /> <br /> private $connect=null;<br /> private $errNo=null;<br /> private $errStr=null;<br /> <br /> <br /> public function __construct($url)<br /> {<br /> $this->hostInfo=parse_url($url);<br /> $this->setRequestHeader(array('Host' => $this->hostInfo['host']));<br /> $this->setRequestHeader(array('Connection' => 'keep-alive'));<br /> }<br /> //设置HTTP请求行信息,例如: GET /resources HTTP/1.1<br /> //但为了避免漏掉url中?开始的查询信息,有必要进行判断<br /> public function setRequestLine($method)<br /> {<br /> //如果是POST请求,则自动添加content-type头信息<br /> if(strtolower($method)=='post')<br /> {<br /> $this->setRequestHeader(array('Content-type' => 'application/x-www-form-urlencoded'));<br /> }<br /> if(!empty($this->hostInfo['query']))<br /> {<br /> $this->requestLine=strtoupper($method)." ".$this->hostInfo['path']."?".$this->hostInfo['query']." HTTP/1.1 \r\n";<br /> }<br /> else<br /> {<br /> $this->requestLine=strtoupper($method)." ".$this->hostInfo['path']." HTTP/1.1 \r\n";<br /> }<br /> }<br /> //设置HTTP请求头。<br /> //接收参数是数组类型,通过迭代拼接key:value,并换行<br /> public function setRequestHeader($header)<br /> {<br /> foreach($header as $key => $value)<br /> {<br /> $this->requestHeader .=$key.":".$value."\r\n";<br /> }<br /> }<br /> //设置HTTP请求体<br /> //接收参数是数组类型,通过迭代拼接key=value,因为最后一席拼接会有一个多余的&,所以有必要去掉<br /> public function setRequestBody($body)<br /> {<br /> foreach($body as $key => $value)<br /> {<br /> $this->requestBody .=$key.'='.$value.'&';<br /> }<br /> $offset=strrpos($this->requestBody, '&');<br /> $this->requestBody=substr($this->requestBody, 0, $offset);<br /> }<br /> //组装 请求行+请求头+请求体,并根据请求体的长度,自动填充请求头的content-length字段<br /> public function setRequestEntity()<br /> {<br /> if(!empty($this->requestBody))<br /> {<br /> $contentLength=strlen($this->requestBody);<br /> $this->setRequestHeader(array('Content-length' => $contentLength));<br /> <br /> $this->requestEntity=$this->requestLine.$this->requestHeader."\r\n".$this->requestBody;<br /> }<br /> else<br /> {<br /> $this->requestEntity=$this->requestLine.$this->requestHeader."\r\n";<br /> }<br /> }<br /> //解析主机名的函数,暂时没有用上.......<br /> public function parseHost($url)<br /> {<br /> $pat='#http://([^/]+)#i';<br /> if(preg_match($pat, $url, $match))<br /> {<br /> return $match[1];<br /> }<br /> else<br /> {<br /> echo '匹配主机信息失败<br />';<br /> }<br /> }<br /> //创建到主机的连接<br /> public function createConnect()<br /> {<br /> $this->connect=fsockopen($this->hostInfo['host'], 80, $this->errNo, $this->errStr) or die('连接主机失败'.$this->errStr);<br /> }<br /> //发送请求<br /> public function sendRequest()<br /> {<br /> $this->setRequestEntity();<br /> echo $this->requestEntity;<br /> exit();<br /> $this->createConnect();<br /> $entityLength=strlen($this->requestEntity);<br /> if($entityLength != fwrite($this->connect, $this->requestEntity, $entityLength))<br /> {<br /> die('写入数据失败<br />');<br /> }<br /> else<br /> {<br /> $this->receiveResponse();<br /> }<br /> }<br /> //接受请求,并依次拼接响应体<br /> public function receiveResponse()<br /> {<br /> while(!feof($this->connect))<br /> {<br /> $this->responseEntity .= fread($this->connect, 1024);<br /> }<br /> }<br /> //计算响应头与响应体之间的空行的位置<br /> public function calculateEmptyLinePos()<br /> {<br /> $this->emptyLinePos=strpos($this->responseEntity,"\r\n\r\n",0);<br /> }<br /> //接受响应体的头部....<br /> public function receiveResponseHeader()<br /> {<br /> $this->calculateEmptyLinePos();<br /> $this->responseHeader=substr($this->responseEntity, 0, $this->emptyLinePos);<br /> echo $this->responseHeader;<br /> }<br /> //接收响应体的body部分<br /> public function receiveResponseBody()<br /> {<br /> $this->calculateEmptyLinePos();<br /> $this->responseBody=substr($this->responseEntity, $this->emptyLinePos);<br /> }<br /> //返回请求结果<br /> public function getResponse()<br /> {<br /> return $this->responseEntity;<br /> }<br /> <br /><br /> public function parseResponse()<br /> {}<br /> public function __destruct()<br /> {<br /> //fclose($this->connect);<br /> } <br />}<br />set_time_limit(60);<br />$http=new Httpwrap("http://www.mmkao.com/Beautyleg/");<br />//设置HTTP请求行<br />$http->setRequestLine("get");<br />//设置HTTP头<br />$http->setRequestHeader(array("Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"));<br />$http->setRequestHeader(array("Accept-Language" => "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3"));<br />$http->setRequestHeader(array("Accept-Encoding" => "gzip, deflate"));<br />$http->setRequestHeader(array("User-Agent" => "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.101 Safari/537.36"));<br />//$http->setRequestHeader(array("Cookie" => "BAIDU_DUP_lcr=http://www.baidu.com/s?wd=beautyleg&rsv_spt=1&issp=1&f=3&rsv_bp=0&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_enter=1&rsv_sug3=6&rsv_sug4=415&rsv_sug1=3&oq=beauty&rsv_sug2=0&rsp=0&inputT=2363; safedog-flow-item=8471BA510DA33350ED344AC374D3044A; bdshare_firstime=1415165097782; cscpvrich_fidx=6; AJSTAT_ok_pages=2; AJSTAT_ok_times=2; CNZZDATA3811623=cnzz_eid%3D253823549-1415164312-http%253A%252F%252Fwww.baidu.com%252F%26ntime%3D1415169712"));<br />//发送数据<br />$http->sendRequest();<br />//$http->receiveResponseHeader();<br /><br />?><br /><br />通过这个类给领一个自定义的脚本,可以发送和接收数据,领一个脚本如下:<br /><br /><?php <br />if(!empty($_POST))<br />{<br /> $str=implode(',',$_POST);<br /> file_put_contents('./post.txt', $str,FILE_APPEND);<br /> echo $str;<br />}<br /><br />?><br />但是给这个网站发送请求时,却超时:网站是:<br />http://www.mmkao.com/Beautyleg/<br />通过chrome给这个网站首页发送请求时的header头信息:<br />Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8<br />Accept-Encoding:gzip,deflate,sdch<br />Accept-Language:zh,en;q=0.8,zh-TW;q=0.6,zh-CN;q=0.4,ja;q=0.2<br />Cache-Control:max-age=0<br />Connection:keep-alive<br />Cookie:BAIDU_DUP_lcr=http://www.baidu.com/s?wd=beautyleg&rsv_spt=1&issp=1&f=3&rsv_bp=0&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_enter=1&rsv_sug3=6&rsv_sug4=415&rsv_sug1=3&oq=beauty&rsv_sug2=0&rsp=0&inputT=2363; safedog-flow-item=8471BA510DA33350ED344AC374D3044A; bdshare_firstime=1415165097782; cscpvrich_fidx=7; AJSTAT_ok_pages=3; AJSTAT_ok_times=2; CNZZDATA3811623=cnzz_eid%3D253823549-1415164312-http%253A%252F%252Fwww.baidu.com%252F%26ntime%3D1415169712<br />DNT:1<br />Host:www.mmkao.com<br />User-Agent:Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.101 Safari/537.36<br />Response Headersview source<br /><br />//通过相同的包装,并调用Httpwrap发送请求时,却提示超时,是在不知道哪里出问题........<br /><br />针对这个网站写了一个过滤出图片链接的类:<br /><br /><?php <br />class Parseimage<br />{<br /> private $responseBody=null;<br /> private $imgLink=null;<br /> private $pageNum=null;<br /> private header=null;<br /> private body=null;<br /> <br /> public function __construct($body)<br /> {<br /> $this->responseBody=$body;<br /> }<br /> //匹配图片src开始的链接地址<br /> public function feedImage()<br /> {<br /> $pat='#<img (.*?)src="(.*?)(?=")#i';<br / alt="自定义HTTP抓包跟过滤" > if(preg_match_all($pat, $body, $match))<br /> {<br /> foreach($match[2] as $link)<br /> {<br /> $this->imgLink[]=$link;<br /> }<br /> }<br /> else<br /> {<br /> echo '匹配失败图片链接地址失败'."<br />";<br /> }<br /> }<br /> //提取head部分<br /> public function filterHeader($body)<br /> {<br /> $pat='#<head>[\s\S]+</head>#im';<br /> if(preg_match($pat, $body, $match))<br /> {<br /> $this->header=$match[0];<br /> }<br /> else<br /> {<br /> echo '匹配head部分失败'."<br />";<br /> }<br /> }<br /> //提取body部分<br /> public function filterBody($body)<br /> {<br /> $pat='#<body>[\s\S]+</body>#im';<br /> if(preg_match($pat, $body, $match))<br /> {<br /> $this->body=$match[0];<br /> }<br /> else<br /> {<br /> echo '匹配body部分失败'."<br />";<br /> }<br /> }<br /> //提取分页信息,这个只能针对性的匹配,不能通用<br /> public function rollPage($body)<br /> {<br /> $pat='#[\x{4e00}-\x{9fa5}]+\s*\d\s+?/\s+?\d+\s*[\x{4e00}-\x{9fa5}]*#ui';<br /> if(preg_match($pat, $body, $match))<br /> {<br /> $patNum='#/\s*(\d\d*)#';<br /> if(preg_match($patNum, $match[0], $num))<br /> {<br /> $this->pageNum=$num[1];<br /> }<br /> else<br /> {<br /> echo '提取分页具体值失败'."<br />";<br /> }<br /> }<br /> else<br /> {<br /> echo '提取分页统计失败'."<br />";<br /> }<br /> }<br /><br />?><br /><br /><br /><br />附注: 这两个类,,都通过了内网的测试,并成功过滤出图片的链接,但是给http://www.mmkao.com/Beautyleg/发送请求时,却提示超时,,不知道哪里出了问题。。。。。。<br /><br /><br /><br /><br />
$url = 'http://www.mmkao.com/Beautyleg/';<br />print_r(get_headers($url));
Array<br />(<br /> [0] => HTTP/1.1 200 OK<br /> [1] => Connection: close<br /> [2] => Date: Wed, 05 Nov 2014 08:53:09 GMT<br /> [3] => Content-Length: 13889<br /> [4] => Content-Type: text/html<br /> [5] => Content-Location: http://www.mmkao.com/Beautyleg/index.html<br /> [6] => Last-Modified: Wed, 05 Nov 2014 05:39:09 GMT<br /> [7] => Accept-Ranges: bytes<br /> [8] => ETag: "e8939ad2baf8cf1:693"<br /> [9] => Server: IIS<br /> [10] => X-Powered-By: WAF/2.0<br /> [11] => Set-Cookie: safedog-flow-item=8471BA510DA33350ED344AC374D3044A; expires=Sat, 12-Dec-2150 10:26:25 GMT; domain=mmkao.com; path=/<br />)<br /><br />