
URL Scraping Tool

WBOY (Original)
2016-07-25 08:48:11
Run it from the command line; calling the script with php and no arguments prints the usage information.

Features
1. Proxy support
2. Configurable number of recursive passes
3. Control over the output type and over which content is checked

Purpose:
Instead of scanning pages by eye, the script collects as many potential request packets and URL addresses as possible, which is useful for penetration testing. An example invocation follows.
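As a rough guide, here is a hypothetical invocation built from the usage message inside the script; the target URL, header file name, and proxy address are placeholders, not values from the original article:

php debug.php http://www.example.com 3 ret.txt header.txt 127.0.0.1:8080

Only the url argument is required: num defaults to 3 recursive passes, filename defaults to ret.txt next to the script, and header and proxy are simply omitted when not needed.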
error_reporting(E_ERROR | E_WARNING | E_PARSE);
ini_set('memory_limit', '1024M');
set_time_limit(0);
define('CHECK_A_TAG', false);
define('CHECK_JS_TAG', true);
define('CHECK_URL', true);
define('SAVE_ERROR', true);
$checkArr = array(
    '$.load',
    '.ajax',
    '$.post',
    '$.get',
    '.getJSON'
);
//the comparison was lost when the article was HTML-escaped; requiring at least the url argument is a reconstruction
if ($argc < 2)
    die(showerror('sorry, parameter error', array(
        'example: php debug.php url num filename header proxy',
        'detail information:',
        'url: target url address which you want to check',
        'num: the number of pages to check recursively, default 3',
        'filename: output filename, default ret.txt',
        'header: file with request headers, default none',
        'proxy: proxy to use, default no proxy'
    )));
if (!check_extension())
    die(showerror('extension curl not support', 'please enable the php curl extension'));
//global variables
$url = trim($argv[1]);
if (stripos($url, 'http') === false)
    $url = 'http://'.$url;
$num = isset($argv[2]) ? intval($argv[2]) : 3;
$output = isset($argv[3]) ? trim(str_replace("\\", '/', $argv[3])) : str_replace("\\", '/', dirname(__FILE__)).'/ret.txt';
$header = null;
$proxy = null;
$host = null;
if (isset($argv[4]))
{
    $header = trim(str_replace("\\", '/', $argv[4]));
    if (file_exists($header))
        $header = array_filter(explode("\n", str_replace("\r", '', file_get_contents($header))));
    else
    {
        $file = str_replace("\\", '/', dirname(__FILE__)).'/'.$header;
        if (file_exists($file))
            $header = array_filter(explode("\n", str_replace("\r", '', file_get_contents($file))));
        else
            $header = null;
    }
}
if (isset($argv[5]))
    $proxy = trim($argv[5]);
if (!is_array($header) || empty($header))
    $header = null;
$result = check_valid_url($url);
$outputArr = array();
if (!empty($result))
{
    $result = str_replace("\r", '', $result);
    $result = str_replace("\n", '', $result);
    $tmpArr = parse_url($url);
    if (!isset($tmpArr['host']))
        die(showerror('parse url error', 'can not get host from url: '.$url));
    $host = $tmpArr['host'];
    if (stripos($host, 'http') === false)
        $host = 'http://'.$host;
    unset($tmpArr);
    //check for current page
    if (!isset($outputArr[md5($url)]))
    {
        $outputArr[md5($url)] = $url;
        file_put_contents($output, $url."\n", FILE_APPEND);
        echo 'url: ', $url, ' find ajax require so save it', PHP_EOL;
    }
    work($result);
}
echo 'run finish', PHP_EOL;

function work($result, $reverse = false)
{
    global $num, $host, $outputArr, $checkArr, $output;
    if (!$result)
        return;
    $result = str_replace("\r", '', $result);
    $result = str_replace("\n", '', $result);
    while ($num > 0)
    {
        echo 'remain: ', $num, ' now start to check for url address', PHP_EOL, PHP_EOL;
        //the original pattern was lost when the article was HTML-escaped; this <a href> pattern is a reconstruction
        preg_match_all('/<a[^>]+href=(["\'])(.*?)\1/i', $result, $match);
        if (CHECK_A_TAG && isset($match[2]) && !empty($match[2]))
        {
            foreach ($match[2] as $mc)
            {
                $mc = trim($mc);
                if ($mc == '#')
                    continue;
                if (stripos($mc, 'http') === false)
                    $mc = $host.$mc;
                if (($ret = check_valid_url($mc)))
                {
                    if (!isset($outputArr[md5($mc)]))
                    {
                        $outputArr[md5($mc)] = $mc;
                        file_put_contents($output, $mc."\n", FILE_APPEND);
                        echo 'url: ', $mc, ' find ajax require so save it', PHP_EOL;
                    }
                }
            }
        }
        //check for page url
        echo 'remain: ', $num, ' now start to check for page url', PHP_EOL, PHP_EOL;
        preg_match_all('/(https?|ftp|mms):\/\/([A-z0-9]+[_\-]?[A-z0-9]+\.)*[A-z0-9]+\-?[A-z0-9]+\.[A-z]{2,}(\/.*)*\/?/i', $result, $match);
        //use $match[0] (the complete matched urls); $match[2] only captures a sub-domain fragment
        if (CHECK_URL && isset($match[0]) && !empty($match[0]))
        {
            foreach ($match[0] as $mc)
            {
                $mc = trim($mc);
                if ($mc == '#')
                    continue;
                if (stripos($mc, 'http') === false)
                    $mc = $host.$mc;
                if (($ret = check_valid_url($mc)))
                {
                    if (!isset($outputArr[md5($mc)]))
                    {
                        $outputArr[md5($mc)] = $mc;
                        file_put_contents($output, $mc."\n", FILE_APPEND);
                        echo 'url: ', $mc, ' find ajax require so save it', PHP_EOL;
                    }
                }
            }
        }
        //check for javascript ajax require
        echo 'remain: ', $num, ' now start to check for javascript ajax require', PHP_EOL, PHP_EOL;
        //the original pattern was lost when the article was HTML-escaped; this <script src> pattern is a reconstruction
        preg_match_all('/<script[^>]+src=(["\'])(.*?)\1/i', $result, $match);
        if (CHECK_JS_TAG && isset($match[2]) && !empty($match[2]))
        {
            foreach ($match[2] as $mc)
            {
                $mc = trim($mc);
                if ($mc == '#')
                    continue;
                if (stripos($mc, 'http') === false)
                    $mc = $host.$mc;
                if (($ret = check_valid_url($mc)))
                {
                    //save the script url if it contains any of the ajax markers
                    foreach ($checkArr as $ck)
                    {
                        if (!isset($outputArr[md5($mc)]) && strpos($ret, $ck) !== false)
                        {
                            $outputArr[md5($mc)] = $mc;
                            file_put_contents($output, $mc."\n", FILE_APPEND);
                            echo 'url: ', $mc, ' find ajax require so save it', PHP_EOL;
                            break;
                        }
                    }
                }
            }
        }
        if ($reverse)
            return;
        //check for next page
        //the original pattern was lost when the article was HTML-escaped; this <a href> pattern is a reconstruction
        preg_match_all('/<a[^>]+href=(["\'])(.*?)\1/i', $result, $match);
        if (isset($match[2]) && !empty($match[2]))
        {
            echo 'check for next page, remain page counts: ', $num, PHP_EOL;
            foreach ($match[2] as $mc)
            {
                $mc = trim($mc);
                if ($mc == '#')
                    continue;
                if (stripos($mc, 'http') === false)
                    $mc = $host.$mc;
                echo 'check for next page: ', $mc, PHP_EOL;
                work(check_valid_url($mc), true);
            }
        }
        $num--;
        sleep(3);
    }
}

function check_valid_url($url)
{
    global $header, $proxy; //added: $header and $proxy are set in the global scope above
    if (stripos($url, 'http') === false)
        $url = 'http://'.$url;
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_HEADER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)');
    if (!is_null($header))
        curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
    if (!is_null($proxy))
        curl_setopt($ch, CURLOPT_PROXY, $proxy);
    $ret = curl_exec($ch);
    $errinfo = curl_error($ch);
    curl_close($ch);
    unset($ch);
    if (!empty($errinfo) || ((strpos($ret, '200 OK') === false) && (strpos($ret, '302 Moved') === false)) || strpos($ret, '114so.cn') !== false)
    {
        showerror('check url: '.$url.' find some errors', array($errinfo, $ret));
        if (SAVE_ERROR)
            file_put_contents(dirname(__FILE__).'/error.txt', $url."\n", FILE_APPEND);
        return false;
    }
    return $ret;
}

function check_extension()
{
    if (!function_exists('curl_init') || !extension_loaded('curl'))
        return false;
    return true;
}

function showerror($t, $c)
{
    $str = "#########################################################################\n";
    $str .= "# ".$t."\n";
    if (is_string($c))
        $str .= "# ".$c;
    elseif (is_array($c) && !empty($c))
    {
        foreach ($c as $c1)
            $str .= "# ".$c1."\n";
    }
    $str .= "\n#########################################################################\n";
    echo $str;
    unset($str);
}
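Since the optional header argument is read line by line and passed straight to CURLOPT_HTTPHEADER, the header file is expected to hold one raw request header per line. A minimal sketch of such a file, with placeholder values:

Cookie: PHPSESSID=placeholder
Referer: http://www.example.com/
X-Requested-With: XMLHttpRequest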

