Maison >développement back-end >tutoriel php >PHP实现的一个简单的爬虫
这个小爬虫的功能是抓取目标网页中的url,并递归地继续爬取这些url。这个小demo是参照网友的代码修改而来的,由于网上的版本太多,无法确定谁才是真正的原作者,这里就不@原作者了。
下面是代码:
<?php
/**
 * Minimal link crawler.
 *
 * Fetches one page, extracts the href targets of its anchor tags and turns
 * them into absolute http(s) URLs. The script at the bottom of this file
 * uses url.txt as a work queue: every discovered URL is appended to the file
 * and the file is read back line by line to obtain the next page to crawl.
 */
class Crawler
{
    /** @var string Absolute URL of the page this instance crawls. */
    private $url;

    /**
     * @param string $url Page to crawl; "http://" is prepended when the
     *                    value carries no http/https scheme.
     */
    public function __construct($url)
    {
        // Lines read back from the URL list still carry their newline.
        $url = trim($url);

        if (strpos($url, '//') === 0) {
            // Protocol-relative form ("//host/path"): only the scheme is missing.
            $url = "http:" . $url;
        } elseif (!preg_match('#^https?://#i', $url)) {
            // Demand a real scheme prefix; a bare "starts with http" test
            // would wrongly accept host names such as "httpbin.org".
            $url = "http://" . $url;
        }

        $this->url = $url;
    }

    /**
     * Downloads at most 1 MiB of the document behind a URL.
     *
     * Prints a short notice when the URL cannot be opened.
     *
     * @param string $url
     * @return string|false Document body, or false when it could not be read.
     */
    protected function _getUrlContent($url)
    {
        // Judge success by the return value of this very call. error_get_last()
        // is process-wide and would keep reporting any earlier, unrelated
        // warning. The warning itself is silenced because the failure is
        // handled right below.
        $handle = @fopen($url, "r");
        if ($handle === false) {
            echo "你的URL好像不对!要不换一个?";
            return false;
        }

        $content = stream_get_contents($handle, 1024 * 1024);
        fclose($handle);

        return $content;
    }

    /**
     * Extracts the raw href values of all anchor tags in an HTML document.
     *
     * @param string $web_content HTML source.
     * @return string[] href values in document order; empty when none are found.
     */
    protected function _filterUrl($web_content)
    {
        // <a ... href=VALUE, where VALUE may be quoted with " or ' or left bare.
        $reg_tag_a = '/<a\s+(?:[^>]*?\s)?href\s*=\s*["\']?([^"\'\s>]+)/i';

        if (preg_match_all($reg_tag_a, $web_content, $match_result)) {
            return $match_result[1];
        }

        return [];
    }

    /**
     * Tells whether a link names its own scheme or host, i.e. is not a
     * path that has to be resolved against the page it was found on.
     *
     * @param string $url
     * @return bool
     */
    protected function _judgeURL($url)
    {
        $url_info = parse_url($url);

        return isset($url_info['scheme']) || isset($url_info['host']);
    }

    /**
     * Resolves the links found on a page against that page's URL.
     *
     * Only http/https targets are kept; fragments are dropped and duplicates
     * are removed.
     *
     * @param string $base_url URL of the page the links were found on.
     * @param mixed  $url_list Raw href values (array of strings).
     * @return string[]|null Absolute URLs, or null when $url_list is not an array.
     */
    protected function _reviseUrl($base_url, $url_list)
    {
        if (!is_array($url_list)) {
            return null;
        }

        $url_info = parse_url($base_url);
        if (!is_array($url_info)) {
            $url_info = [];
        }

        // Assemble "scheme://user:pass@host:port" - everything before the path.
        $root = isset($url_info["scheme"]) ? $url_info["scheme"] . '://' : "";
        if (isset($url_info["user"]) && isset($url_info["pass"])) {
            $root .= $url_info["user"] . ":" . $url_info["pass"] . "@";
        }
        $root .= isset($url_info["host"]) ? $url_info["host"] : "";
        if (isset($url_info["port"])) {
            $root .= ":" . $url_info["port"];
        }

        // Path of the base document and the directory that contains it.
        $path = isset($url_info["path"]) ? $url_info["path"] : "/";
        $slash = strrpos($path, '/');
        $dir = $slash === false ? '/' : substr($path, 0, $slash + 1);

        // Scheme handed to protocol-relative links ("//host/path").
        $protocol = isset($url_info["scheme"]) ? $url_info["scheme"] : "http";

        $result = [];
        foreach ($url_list as $url_item) {
            $url_item = trim($url_item);

            // A fragment never changes which document is fetched.
            $hash = strpos($url_item, '#');
            if ($hash !== false) {
                $url_item = substr($url_item, 0, $hash);
            }
            if ($url_item === '') {
                continue;
            }

            if ($this->_judgeURL($url_item)) {
                if (strpos($url_item, '//') === 0) {
                    $url_item = $protocol . ':' . $url_item;
                }
                // javascript:, mailto:, tel: and the like cannot be crawled.
                if (preg_match('#^https?://#i', $url_item)) {
                    $result[] = $url_item;
                }
                continue;
            }

            if ($url_item[0] === '/') {
                $target = $url_item;            // relative to the site root
            } elseif ($url_item[0] === '?') {
                $target = $path . $url_item;    // same document, other query
            } else {
                $target = $dir . $url_item;     // relative to the directory
            }

            $result[] = $root . $this->_removeDotSegments($target);
        }

        return array_values(array_unique($result));
    }

    /**
     * Collapses "." and ".." segments in the path part of a reference,
     * leaving any query string untouched.
     *
     * @param string $path Path, optionally followed by "?query".
     * @return string
     */
    private function _removeDotSegments($path)
    {
        $suffix = '';
        $cut = strcspn($path, '?#');
        if ($cut < strlen($path)) {
            $suffix = substr($path, $cut);
            $path = substr($path, 0, $cut);
        }

        $segments = explode('/', $path);
        $last = end($segments);

        $kept = [];
        foreach ($segments as $segment) {
            if ($segment === '.') {
                continue;
            }
            if ($segment === '..') {
                // Never climb above the first segment (the root of the path).
                if (count($kept) > 1) {
                    array_pop($kept);
                }
                continue;
            }
            $kept[] = $segment;
        }

        // "/a/b/.." designates the directory "/a/", so keep the trailing slash.
        if ($last === '.' || $last === '..') {
            $kept[] = '';
        }

        return implode('/', $kept) . $suffix;
    }

    /**
     * Crawls the page given to the constructor.
     *
     * @return string[]|null Absolute URLs linked from the page, or null when
     *                       the page could not be read or contains no links.
     */
    public function crawler()
    {
        $content = $this->_getUrlContent($this->url);
        if (!$content) {
            return null;
        }

        $url_list = $this->_reviseUrl($this->url, $this->_filterUrl($content));

        return $url_list ? $url_list : null;
    }
}

$fp_puts = fopen("url.txt", "ab"); // append handle: stores every discovered URL
$fp_gets = fopen("url.txt", "r");  // read handle: feeds stored URLs back as work
if ($fp_puts === false || $fp_gets === false) {
    exit("Unable to open url.txt\n");
}

$current_url = "www.baidu.com";
$visited = [];  // pages already crawled during this run
$recorded = []; // URLs already appended during this run

do {
    $current_url = trim($current_url);
    if ($current_url === '' || isset($visited[$current_url])) {
        // Blank line or a page that was already handled: fetch the next line.
        continue;
    }
    $visited[$current_url] = true;

    $Crawler = new Crawler($current_url);
    $url_arr = $Crawler->crawler();
    if ($url_arr) {
        foreach ($url_arr as $url) {
            if (isset($recorded[$url])) {
                continue;
            }
            $recorded[$url] = true;
            fputs($fp_puts, $url . "\n");
        }
    }
    // No length limit on fgets(): a long URL must not be split into two lines.
} while (($current_url = fgets($fp_gets)) !== false);

fclose($fp_gets);
fclose($fp_puts);
?>
由于循环中需要new的对象可能会很多,当时想用单例模式来解决,以免内存开销太大,后来嫌麻烦就不了了之了……
以上就介绍了PHP实现的一个简单的爬虫的相关内容,希望对PHP教程有兴趣的朋友有所帮助。