Heim  >  Artikel  >  Backend-Entwicklung  >  URL抓取工具


2016-07-25 08:48:11 · 1240 Aufrufe

  // Command-line entry point of the URL crawler.
  // Usage: php debug.php url [num] [filename] [header] [proxy]
  // Fetches the start URL, records it in the output file and hands the page
  // content to work() for further scanning.
  error_reporting(E_ERROR | E_WARNING | E_PARSE);
  ini_set('memory_limit', '1024M');
  set_time_limit(0);

  // Feature switches read by work() and check_valid_url().
  define('CHECK_A_TAG', false);
  define('CHECK_JS_TAG', true);
  define('CHECK_URL', true);
  define('SAVE_ERROR', true);

  // Substrings searched for in fetched scripts to decide whether they issue AJAX requests.
  $checkArr = array(
      '$.load',
      '.ajax',
      '$.post',
      '$.get',
      '.getJSON'
  );

  // Only the url argument is mandatory: $argv[1] is the sole argument read
  // without an isset() guard below.
  if ($argc < 2)
      die(showerror('sorry, parameter error', array('example: php debug.php url num filename header proxy', 'detail information:', 'url: target url address which you want to check it', 'num: The number of pages of recursive,default 3', 'filename: output filename default name ret.txt', 'header: The request header file default null', 'proxy: if you want to use proxy set it here default no use proxy')));
  if (!check_extension())
      die(showerror('extension curl not support', 'please open php curl extension support'));

  //global variable
  $url = trim($argv[1]);
  // Test for a leading scheme; a plain substring search would also accept
  // "example.com/http-docs" or "httpbin.org" as already having one.
  if (!preg_match('#^https?://#i', $url))
      $url = 'http://'.$url;
  $num = isset($argv[2]) ? intval($argv[2]) : 3;
  $scriptDir = str_replace("\\", '/', dirname(__FILE__));
  $output = isset($argv[3]) ? trim(str_replace("\\", '/', $argv[3])) : $scriptDir.'/ret.txt';
  $header = null;
  $proxy = null;
  $host = null;

  if (isset($argv[4]))
  {
      // The header file is looked up as given first, then relative to this script.
      $headerFile = trim(str_replace("\\", '/', $argv[4]));
      if (!is_file($headerFile))
          $headerFile = $scriptDir.'/'.$headerFile;
      if (is_file($headerFile))
          $header = array_filter(explode("\n", str_replace("\r", '', file_get_contents($headerFile))));
  }
  if (isset($argv[5]))
      $proxy = trim($argv[5]);
  if (!is_array($header) || empty($header))
      $header = null;

  $result = check_valid_url($url);
  $outputArr = array();
  if (!empty($result))
  {
      $tmpArr = parse_url($url);
      if (!isset($tmpArr['host']))
          die(showerror('parse url error', 'can not get host form url: '.$url));
      // Base for relative links: always scheme://host[:port]. The scheme comes
      // from the parsed URL rather than from searching the host name for "http".
      $host = (isset($tmpArr['scheme']) ? $tmpArr['scheme'] : 'http').'://'.$tmpArr['host'];
      if (isset($tmpArr['port']))
          $host .= ':'.$tmpArr['port'];
      unset($tmpArr);

      //check for current page
      if (!isset($outputArr[md5($url)]))
      {
          $outputArr[md5($url)] = $url;
          file_put_contents($output, $url."\n", FILE_APPEND);
          echo 'url: ',$url,' find ajax require so save it',PHP_EOL;
      }
      // work() strips line breaks from the content itself.
      work($result);
  }
  echo 'run finish',PHP_EOL;
  /**
   * Scan fetched page content for URLs and append the interesting ones to the output file.
   *
   * Depending on the CHECK_A_TAG / CHECK_URL / CHECK_JS_TAG switches this looks at
   * anchor hrefs, absolute URLs appearing anywhere in the content, and script
   * sources. Anchor and plain URLs are saved when check_valid_url() accepts them;
   * script URLs are saved only when the fetched script contains one of the
   * $checkArr markers. Unless $reverse is set, every anchor target is then
   * fetched and scanned once (non-recursively).
   *
   * NOTE(review): the anchor, script and next-page patterns in the published
   * listing were empty ('//i'), apparently because the markup inside them was
   * stripped. The patterns below are reconstructions that expose the URL as
   * capture group 2, as the original code expected - confirm against the
   * original script if it is available.
   *
   * NOTE(review): the loop rescans the same $result on every pass, exactly as
   * before; $num only limits the number of passes. Confirm whether a real
   * depth-limited crawl was intended.
   *
   * @param string|false $result  Raw response text as returned by check_valid_url().
   * @param bool         $reverse When true, scan this content once and do not follow links.
   * @return void
   */
  function work($result, $reverse = false)
  {
      global $num, $outputArr, $checkArr;
      if (!$result)
          return;
      $result = str_replace(array("\r", "\n"), '', $result);

      $linkPattern = '/<a\b[^>]*\bhref\s*=\s*(["\']?)([^"\'\s>]+)\1/i';
      $scriptPattern = '/<script\b[^>]*\bsrc\s*=\s*(["\']?)([^"\'\s>]+)\1/i';
      // The whole match (group 0) is the URL. The path stops at whitespace,
      // quotes and angle brackets, so it cannot run on to the end of the page.
      $urlPattern = '~\b(?:https?|ftp|mms)://(?:[a-z0-9](?:[a-z0-9_\-]*[a-z0-9])?\.)+[a-z]{2,}(?::\d+)?(?:/[^\s"\'<>()]*)?~i';

      while ($num > 0)
      {
          echo 'remain: ',$num,' now start to check for url address',PHP_EOL,PHP_EOL;
          if (CHECK_A_TAG)
          {
              foreach (work_extract($linkPattern, $result, 2) as $mc)
              {
                  $mc = work_resolve_url($mc);
                  // Skip unusable links and ones already saved, so they are not fetched again.
                  if ($mc === null || isset($outputArr[md5($mc)]))
                      continue;
                  if (check_valid_url($mc))
                      work_record_url($mc);
              }
          }

          //check for page url
          echo 'remain: ',$num,' now start to check for page url',PHP_EOL,PHP_EOL;
          if (CHECK_URL)
          {
              foreach (work_extract($urlPattern, $result, 0) as $mc)
              {
                  $mc = work_resolve_url($mc);
                  if ($mc === null || isset($outputArr[md5($mc)]))
                      continue;
                  if (check_valid_url($mc))
                      work_record_url($mc);
              }
          }

          //check for javascript ajax require
          echo 'remain: ',$num,' now start to check for javascript ajax require',PHP_EOL,PHP_EOL;
          if (CHECK_JS_TAG)
          {
              foreach (work_extract($scriptPattern, $result, 2) as $mc)
              {
                  $mc = work_resolve_url($mc);
                  if ($mc === null || isset($outputArr[md5($mc)]))
                      continue;
                  $ret = check_valid_url($mc);
                  if (!$ret)
                      continue;
                  // Save the script URL as soon as one AJAX marker is found in its source.
                  foreach ($checkArr as $ck)
                  {
                      if (strpos($ret, $ck) !== false)
                      {
                          work_record_url($mc);
                          break;
                      }
                  }
              }
          }

          if ($reverse)
              return;

          //check for next page
          $nextPages = work_extract($linkPattern, $result, 2);
          if (!empty($nextPages))
          {
              echo 'check for next page, remain page counts: ',$num,PHP_EOL;
              foreach ($nextPages as $mc)
              {
                  $mc = work_resolve_url($mc);
                  if ($mc === null)
                      continue;
                  echo 'check for next page: ',$mc,PHP_EOL;
                  work(check_valid_url($mc), true);
              }
          }
          $num--;
          sleep(3);
      }
  }

  /**
   * Return the distinct values of one capture group for every match of $pattern in $content.
   *
   * @param string $pattern PCRE pattern.
   * @param string $content Text to search.
   * @param int    $group   Capture group index to collect (0 for the whole match).
   * @return array Matched strings; empty when nothing matched or the pattern failed.
   */
  function work_extract($pattern, $content, $group)
  {
      if (!preg_match_all($pattern, $content, $match) || empty($match[$group]))
          return array();
      return array_unique($match[$group]);
  }

  /**
   * Turn a link found in a page into an absolute URL based on the global $host.
   *
   * @param string $link Raw href/src/URL text.
   * @return string|null Absolute URL, or null for empty links, fragments and non-fetchable schemes.
   */
  function work_resolve_url($link)
  {
      global $host;
      $link = trim($link);
      if ($link === '' || $link[0] === '#' || preg_match('#^(?:javascript|mailto|tel|data):#i', $link))
          return null;
      // Already absolute.
      if (preg_match('#^[a-z][a-z0-9+.\-]*://#i', $link))
          return $link;
      // Protocol-relative link.
      if (strpos($link, '//') === 0)
          return 'http:'.$link;
      // Relative link: join with exactly one slash.
      return rtrim((string) $host, '/').'/'.ltrim($link, '/');
  }

  /**
   * Append $link to the output file unless it has been recorded before.
   *
   * @param string $link Absolute URL to record.
   * @return bool True when the URL was newly written, false when it was already known.
   */
  function work_record_url($link)
  {
      global $outputArr, $output;
      $key = md5($link);
      if (isset($outputArr[$key]))
          return false;
      $outputArr[$key] = $link;
      file_put_contents($output, $link."\n", FILE_APPEND);
      echo 'url: ',$link,' find ajax require so save it',PHP_EOL;
      return true;
  }
  /**
   * Fetch a URL with cURL and return the raw response (headers plus body) when it looks usable.
   *
   * The request follows redirects, identifies itself with a Googlebot user agent
   * and applies the request headers and proxy configured in the global $header
   * and $proxy variables. A fetch is rejected on a transport error, on a final
   * HTTP status other than 200 or 302, or when the response contains the string
   * "114so.cn". Rejected URLs are reported through showerror() and, when
   * SAVE_ERROR is on, appended to error.txt next to this script.
   *
   * @param string $url Address to fetch; "http://" is prepended when it has no scheme.
   * @return string|false Response text on success, false on failure.
   */
  function check_valid_url($url)
  {
      // These are set by the main script; without this declaration they are
      // undefined here and the header/proxy options are silently ignored.
      global $header, $proxy;

      // Test for a leading scheme, not for "http" anywhere in the string.
      if (!preg_match('#^[a-z][a-z0-9+.\-]*://#i', $url))
          $url = 'http://'.$url;

      $ch = curl_init();
      curl_setopt($ch, CURLOPT_URL, $url);
      curl_setopt($ch, CURLOPT_HEADER, true);
      curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
      curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
      curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)');
      if (is_array($header) && !empty($header))
          curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
      if (is_string($proxy) && $proxy !== '')
          curl_setopt($ch, CURLOPT_PROXY, $proxy);
      $ret = curl_exec($ch);
      $errinfo = curl_error($ch);
      // Use the status code reported by cURL: HTTP/2 status lines carry no
      // "OK" text, and page bodies may contain "200 OK" by coincidence.
      $status = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
      curl_close($ch);
      unset($ch);

      $failed = $ret === false
          || !empty($errinfo)
          || ($status !== 200 && $status !== 302)
          || strpos($ret, '114so.cn') !== false;
      if ($failed)
      {
          showerror('check url: '.$url. ' find some errors', array($errinfo, (string) $ret));
          if (SAVE_ERROR)
              file_put_contents(dirname(__FILE__).'/error.txt', $url."\n", FILE_APPEND);
          return false;
      }
      return $ret;
  }
  /**
   * Report whether the cURL extension is usable.
   *
   * @return bool True only when curl_init() exists and the "curl" extension is loaded.
   */
  function check_extension()
  {
      return function_exists('curl_init') && extension_loaded('curl');
  }
  /**
   * Print a message framed by two rows of '#' characters.
   *
   * The title goes on its own "# " line. A string $c is printed as a single
   * "# " line; a non-empty array $c is printed as one "# " line per element.
   * Any other $c prints only the title. Nothing is returned.
   *
   * @param string       $t Title line.
   * @param string|array $c Detail text, or a list of detail lines.
   * @return void
   */
  function showerror($t, $c)
  {
      $bar = "#########################################################################";
      $parts = array($bar."\n", "# ".$t."\n");

      if (is_string($c))
      {
          // A string detail carries no newline of its own; the closing row supplies it.
          $parts[] = "# ".$c;
      }
      elseif (is_array($c) && !empty($c))
      {
          foreach ($c as $detail)
              $parts[] = "# ".$detail."\n";
      }

      $parts[] = "\n".$bar."\n";
      echo implode('', $parts);
  }

Der Inhalt dieses Artikels wird freiwillig von Internetnutzern beigesteuert und das Urheberrecht liegt beim ursprünglichen Autor. Diese Website übernimmt keine entsprechende rechtliche Verantwortung. Wenn Sie Inhalte finden, bei denen der Verdacht eines Plagiats oder einer Rechtsverletzung besteht, wenden Sie sich bitte an admin@php.cn