Home > Article > Backend Development > php抓取这个网页的数据,只要数据,不要html内容,然后json后写入文件,新手求教
http://www.okooo.com/Upload/sohu/table_23.html
新手求教啊,这个问题的难点在于正则,我不会写正则。
// Fetch a page of HTML tables, strip the markup, and emit the cell text as
// nested arrays: one list per table, each split into rows of three cells.
// The result is dumped with print_r() and then echoed as JSON.
$url = 'http://www.okooo.com/Upload/sohu/table_23.html';

// file_get_contents() returns false on failure; stop here instead of feeding
// false into the regex below.
$s = file_get_contents($url);
if ($s === false) {
    throw new RuntimeException('Failed to fetch ' . $url);
}

// Capture every <table>...</table> span. Modifiers: i = case-insensitive,
// s = "." also matches newlines, U = ungreedy so each table is matched on
// its own rather than as one span from the first <table> to the last </table>.
if (preg_match_all('#<table.+</table>#isU', $s, $m) === false) {
    throw new RuntimeException('Regex matching failed, PCRE error code ' . preg_last_error());
}

// Start from an empty list so print_r()/json_encode() receive an array
// (and emit "[]") even when the page contains no tables.
$res = [];
foreach (array_map('strip_tags', $m[0]) as $r) {
    // After the tags are stripped, the cell texts are separated only by whitespace.
    $a = preg_split('/\s+/', $r, -1, PREG_SPLIT_NO_EMPTY);

    // Drop the last token, then group the rest into rows of three columns.
    // NOTE(review): the dropped token is presumably a trailing non-data cell
    // in each table - confirm against the live page.
    $res[] = array_chunk(array_slice($a, 0, -1), 3);
}

print_r($res);
echo json_encode($res);
Array( [0] => Array ( [0] => Array ( [0] => 排名 [1] => 球队 [2] => 积分 ) [1] => Array ( [0] => 1 [1] => 尤文图斯 [2] => 102 ) [2] => Array ( [0] => 2 [1] => 罗马 [2] => 85 ) [3] => Array ( [0] => 3 [1] => 那不勒斯 [2] => 78 ) [4] => Array ( [0] => 4 [1] => 佛罗伦萨 [2] => 65 ) [5] => Array ( [0] => 5 [1] => 国际米兰 [2] => 60 ) [6] => Array ( [0] => 6 [1] => 帕尔马 [2] => 58 ) [7] => Array ( [0] => 7 [1] => 都灵 [2] => 57 ) [8] => Array ( [0] => 8 [1] => AC米兰 [2] => 57 ) [9] => Array ( [0] => 9 [1] => 拉齐奥 [2] => 56 ) [10] => Array ( [0] => 10 [1] => 维罗纳 [2] => 54 ) ) [1] => Array ( [0] => Array ( [0] => 进球数 [1] => 球员 [2] => 球队 ) [1] => Array ( [0] => 22 [1] => 伊莫比莱 [2] => 都灵 ) [2] => Array ( [0] => 20 [1] => 托尼 [2] => 维罗纳 ) [3] => Array ( [0] => 19 [1] => 特维斯 [2] => 尤文图斯 ) [4] => Array ( [0] => 17 [1] => 帕拉西奥 [2] => 国际米兰 ) [5] => Array ( [0] => 17 [1] => 伊瓜因 [2] => 那不勒斯 ) [6] => Array ( [0] => 16 [1] => 略伦特 [2] => 尤文图斯 ) [7] => Array ( [0] => 16 [1] => 迪纳塔莱 [2] => 乌迪内斯 ) [8] => Array ( [0] => 16 [1] => 朱塞佩·罗西 [2] => 佛罗伦萨 ) [9] => Array ( [0] => 16 [1] => 贝拉尔迪 [2] => 莎索罗 ) [10] => Array ( [0] => 15 [1] => 卡列洪 [2] => 那不勒斯 ) 
))[[["\u6392\u540d","\u7403\u961f","\u79ef\u5206"],["1","\u5c24\u6587\u56fe\u65af","102"],["2","\u7f57\u9a6c","85"],["3","\u90a3\u4e0d\u52d2\u65af","78"],["4","\u4f5b\u7f57\u4f26\u8428","65"],["5","\u56fd\u9645\u7c73\u5170","60"],["6","\u5e15\u5c14\u9a6c","58"],["7","\u90fd\u7075","57"],["8","AC\u7c73\u5170","57"],["9","\u62c9\u9f50\u5965","56"],["10","\u7ef4\u7f57\u7eb3","54"]],[["\u8fdb\u7403\u6570","\u7403\u5458","\u7403\u961f"],["22","\u4f0a\u83ab\u6bd4\u83b1","\u90fd\u7075"],["20","\u6258\u5c3c","\u7ef4\u7f57\u7eb3"],["19","\u7279\u7ef4\u65af","\u5c24\u6587\u56fe\u65af"],["17","\u5e15\u62c9\u897f\u5965","\u56fd\u9645\u7c73\u5170"],["17","\u4f0a\u74dc\u56e0","\u90a3\u4e0d\u52d2\u65af"],["16","\u7565\u4f26\u7279","\u5c24\u6587\u56fe\u65af"],["16","\u8fea\u7eb3\u5854\u83b1","\u4e4c\u8fea\u5185\u65af"],["16","\u6731\u585e\u4f69\u00b7\u7f57\u897f","\u4f5b\u7f57\u4f26\u8428"],["16","\u8d1d\u62c9\u5c14\u8fea","\u838e\u7d22\u7f57"],["15","\u5361\u5217\u6d2a","\u90a3\u4e0d\u52d2\u65af"]]]写文件你就自己练练手吧
也不是非得用正则,可以试试用 DOM 处理,搜一下 Simple HTML DOM 看看。