CURL可谓居家旅行必备之杀人良药,为何如此形容?就是因为他好用方便能实现页面抓取模拟登录采集等一系列功能。
记得第一次接触CURL的时候是要实现完成从邮箱用户列表的抓取。当时为了赶进度没有细细研究只是网上找了一些资料实现了功能。现在把当初的代码整理一下功能依旧能用
复制代码 代码如下:
error_reporting ( 0 );
set_time_limit ( 0 );
header ( "Content-Type: text/html; charset=GB2312" );
//邮箱用户名密码
$user = 'username';
$pass = 'password';
//创建一个文件用于存放cookie信息
define ( "COOKIEJAR", tempnam ( ini_get ( "upload_tmp_dir" ), "cookie" ) );
$url = 'http://reg.163.com/logins.jsp?type=1&url=http://entry.mail.163.com/coremail/fcg/ntesdoor2?lightweight%3D1%26verifycookie%3D1%26language%3D-1%26style%3D-1';
$refer = 'http://mail.163.com';
$fields_post = array ('username' => $user, 'password' => $pass, 'verifycookie' => 1, 'style' => - 1, 'product' => 'mail163', 'selType' => - 1, 'secure' => 'on' );
$fields_string = http_build_query ( $fields_post, '&' );
$headers_login = array ('User-Agent' => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/2008052906 Firefox/3.0', 'Referer' => 'http://www.163.com' );
//登录
$ch = curl_init ( $url );
curl_setopt ( $ch, CURLOPT_RETURNTRANSFER, true );
curl_setopt ( $ch, CURLOPT_HEADER, true );
curl_setopt ( $ch, CURLOPT_CONNECTTIMEOUT, 120 );
curl_setopt ( $ch, CURLOPT_POST, true );
curl_setopt ( $ch, CURLOPT_REFERER, $refer );
curl_setopt ( $ch, CURLOPT_COOKIESESSION, true );
curl_setopt ( $ch, CURLOPT_COOKIEJAR, COOKIEJAR );
curl_setopt ( $ch, CURLOPT_HTTPHEADER, $headers_login );
curl_setopt ( $ch, CURLOPT_POST, count ( $fields ) );
curl_setopt ( $ch, CURLOPT_POSTFIELDS, $fields_string );
$result = curl_exec ( $ch );
curl_close ( $ch );
//跳转
$url = 'http://entry.mail.163.com/coremail/fcg/ntesdoor2?lightweight=1&verifycookie=1&language=-1&style=-1&username=loki_wuxi';
$headers = array ('User-Agent' => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/2008052906 Firefox/3.0' );
$ch = curl_init ( $url );
curl_setopt ( $ch, CURLOPT_RETURNTRANSFER, true );
curl_setopt ( $ch, CURLOPT_HEADER, true );
curl_setopt ( $ch, CURLOPT_CONNECTTIMEOUT, 120 );
curl_setopt ( $ch, CURLOPT_POST, true );
curl_setopt ( $ch, CURLOPT_HTTPHEADER, $headers );
curl_setopt ( $ch, CURLOPT_COOKIEFILE, COOKIEJAR );
curl_setopt ( $ch, CURLOPT_COOKIEJAR, COOKIEJAR );
$result = curl_exec ( $ch );
curl_close ( $ch );
//取得sid
preg_match ( '/sid=[^\"].*/', $result, $location );
$sid = substr ( $location [0], 4, - 1 );
//通讯录地址
$url = 'http://g4a30.mail.163.com/jy3/address/addrlist.jsp?sid=' . $sid . '&gid=all';
$headers = array ('User-Agent' => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/2008052906 Firefox/3.0' );
$ch = curl_init ( $url );
curl_setopt ( $ch, CURLOPT_RETURNTRANSFER, true );
curl_setopt ( $ch, CURLOPT_HEADER, true );
curl_setopt ( $ch, CURLOPT_CONNECTTIMEOUT, 120 );
curl_setopt ( $ch, CURLOPT_POST, true );
curl_setopt ( $ch, CURLOPT_HTTPHEADER, $headers );
curl_setopt ( $ch, CURLOPT_COOKIEFILE, COOKIEJAR );
curl_setopt ( $ch, CURLOPT_COOKIEJAR, COOKIEJAR );
$result = curl_exec ( $ch );
curl_close ( $ch );
unlink ( COOKIEJAR );
//开始抓取内容
preg_match_all ( '/