首頁  >  文章  >  後端開發  >  URL抓取工具

URL抓取工具

WBOY
WBOY原創
2016-07-25 08:48:11 1239瀏覽
有需要csdn免積分下載、pudn免積分下載、51cto免積分,請到http://www.itziy.com/
命令列下執行,直接php呼叫將顯示使用方式
功能說明
1.支援代理
2.支援設定遞歸檢查次數
3.支援輸出類型控制、檢查內容控制

作用:
主要代替肉眼盡量多的抓取可能的請求包及url位址等,方便滲透測試
  // Command-line entry point of the URL grabbing tool.
  //
  // Usage: php debug.php url [num] [filename] [header] [proxy]
  //   url      target address to inspect
  //   num      number of checking passes (default 3)
  //   filename output file (default: ret.txt next to this script)
  //   header   file with one extra request header per line (default: none)
  //   proxy    proxy passed to cURL (default: none)

  // Report fatal errors, warnings and parse errors only.
  error_reporting(E_ERROR | E_WARNING | E_PARSE);
  ini_set('memory_limit', '1024M');

  // Feature switches read by work() and check_valid_url().
  define('CHECK_A_TAG', false);  // validate and save every <a href> target
  define('CHECK_JS_TAG', true);  // fetch <script src> files and look for ajax calls
  define('CHECK_URL', true);     // validate and save absolute URLs found in the page text
  define('SAVE_ERROR', true);    // append failing URLs to error.txt

  // Substrings whose presence in a script file marks it as issuing ajax requests.
  $checkArr = array(
      '$.load',
      '.ajax',
      '$.post',
      '$.get',
      '.getJSON'
  );

  if ($argc < 2)
  {
      showerror('sorry, parameter error', array(
          'example: php debug.php url num filename header proxy',
          'detail information:',
          'url: target url address which you want to check it',
          'num: The number of pages of recursive,default 3',
          'filename: output filename default name ret.txt',
          'header: The request header file default null',
          'proxy: if you want to use proxy set it here default no use proxy'
      ));
      exit(1);
  }
  if (!check_extension())
  {
      showerror('extension not support', 'please open php curl extension support');
      exit(1);
  }

  // global variables shared with work() and check_valid_url()
  $scriptDir = str_replace("\\", '/', dirname(__FILE__));
  $url = trim($argv[1]);
  // Only a leading scheme counts; "http" inside a query string must not suppress the prefix.
  if (!preg_match('#^https?://#i', $url))
      $url = 'http://'.$url;
  $num = isset($argv[2]) ? intval($argv[2]) : 3;
  $output = isset($argv[3]) ? trim(str_replace("\\", '/', $argv[3])) : $scriptDir.'/ret.txt';
  $header = null;
  $proxy = null;
  $host = null;

  if (isset($argv[4]))
  {
      // The header file may be given as-is or relative to the script directory.
      $headerFile = trim(str_replace("\\", '/', $argv[4]));
      if (!is_file($headerFile))
          $headerFile = $scriptDir.'/'.$headerFile;
      if (is_file($headerFile))
      {
          $lines = explode("\n", str_replace("\r", '', (string) file_get_contents($headerFile)));
          // Drop blank lines; array_values() keeps the list sequential for cURL.
          $header = array_values(array_filter(array_map('trim', $lines), 'strlen'));
      }
  }
  if (!is_array($header) || empty($header))
      $header = null;

  if (isset($argv[5]))
  {
      $proxy = trim($argv[5]);
      if ($proxy === '')
          $proxy = null;
  }

  $result = check_valid_url($url);
  $outputArr = array();
  if (!empty($result))
  {
      $tmpArr = parse_url($url);
      if (!isset($tmpArr['host']))
      {
          showerror('parse url error', 'can not get host form url: '.$url);
          exit(1);
      }
      // Base used by work() to resolve relative links; keep the real scheme and port
      // instead of always forcing "http://".
      $host = (isset($tmpArr['scheme']) ? $tmpArr['scheme'] : 'http').'://'.$tmpArr['host'];
      if (isset($tmpArr['port']))
          $host .= ':'.$tmpArr['port'];
      unset($tmpArr);

      // check for current page
      if (!isset($outputArr[md5($url)]))
      {
          $outputArr[md5($url)] = $url;
          file_put_contents($output, $url."\n", FILE_APPEND);
          echo 'url: ',$url,' find ajax require so save it',PHP_EOL;
      }
      work($result);
  }
  echo 'run finish',PHP_EOL;
  /**
   * Scan one fetched page for links, absolute URLs and script files, and
   * append every newly found address to the output file.
   *
   * Uses the globals $num (remaining passes), $host (base for relative links),
   * $outputArr (md5(url) => url map of saved addresses), $checkArr (ajax
   * markers) and $output (output file path).
   *
   * NOTE(review): every pass of the loop rescans the same $result; $num only
   * limits how often the page and its direct links are re-checked.
   *
   * @param string|false $result  Raw response text of the page; falsy values are ignored.
   * @param bool         $reverse When true (used for linked pages) only one pass
   *                              is made and linked pages are not followed.
   * @return void
   */
  function work($result, $reverse = false)
  {
      global $num, $outputArr, $checkArr;

      if (!$result)
          return;
      $result = str_replace(array("\r", "\n"), '', $result);

      // Group 2 of both tag patterns is the attribute value.
      $anchorPattern = '/<a\s[^>]*?href\s*=\s*(["\'])(.*?)\1/i';
      $scriptPattern = '/<script\s[^>]*?src\s*=\s*(["\'])(.*?)\1/i';
      // Group 0 (the whole match) is the URL; the path stops at whitespace, quotes and
      // brackets, because the page has no line breaks left to stop a greedy ".*".
      $urlPattern = '#(?:https?|ftp|mms)://[a-z0-9](?:[a-z0-9_\-]*\.)+[a-z]{2,}(?::\d+)?(?:/[^\s"\'<>()]*)?#i';

      while ($num > 0)
      {
          // check for <a href> targets
          echo 'remain: ',$num,' now start to check for url address',PHP_EOL,PHP_EOL;
          if (CHECK_A_TAG)
          {
              foreach (work_extract_links($anchorPattern, 2, $result) as $mc)
              {
                  // Already saved addresses need no second request.
                  if (!isset($outputArr[md5($mc)]) && check_valid_url($mc))
                      work_record_url($mc);
              }
          }

          // check for page url
          echo 'remain: ',$num,' now start to check for page url',PHP_EOL,PHP_EOL;
          if (CHECK_URL)
          {
              foreach (work_extract_links($urlPattern, 0, $result) as $mc)
              {
                  if (!isset($outputArr[md5($mc)]) && check_valid_url($mc))
                      work_record_url($mc);
              }
          }

          // check for javascript ajax require
          echo 'remain: ',$num,' now start to check for javascript ajax require',PHP_EOL,PHP_EOL;
          if (CHECK_JS_TAG)
          {
              foreach (work_extract_links($scriptPattern, 2, $result) as $mc)
              {
                  if (isset($outputArr[md5($mc)]))
                      continue;
                  $ret = check_valid_url($mc);
                  if (!$ret)
                      continue;
                  // Save the script only when it contains one of the ajax markers.
                  foreach ($checkArr as $ck)
                  {
                      if (strpos($ret, $ck) !== false)
                      {
                          work_record_url($mc);
                          break;
                      }
                  }
              }
          }

          if ($reverse)
              return;

          // check for next page: run the same checks once on every linked page
          $nextPages = work_extract_links($anchorPattern, 2, $result);
          if (!empty($nextPages))
          {
              echo 'check for next page, remain: ',$num,PHP_EOL;
              foreach ($nextPages as $mc)
              {
                  echo 'check next page: ',$mc,PHP_EOL;
                  work(check_valid_url($mc), true);
              }
          }

          $num--;
          sleep(3);
      }
  }

  /**
   * Collect the distinct, absolute addresses matched by $pattern in $html.
   *
   * Fragment-only links and javascript:/mailto:/tel:/data: links are skipped;
   * relative links are resolved against the global $host.
   *
   * @param string $pattern PCRE pattern.
   * @param int    $group   Capture group holding the address (0 = whole match).
   * @param string $html    Text to search.
   * @return array List of absolute URLs in order of first appearance.
   */
  function work_extract_links($pattern, $group, $html)
  {
      global $host;

      if (!preg_match_all($pattern, $html, $match) || empty($match[$group]))
          return array();

      $base = rtrim((string) $host, '/');
      $scheme = parse_url($base, PHP_URL_SCHEME);
      if (!$scheme)
          $scheme = 'http';

      $links = array();
      foreach ($match[$group] as $mc)
      {
          $mc = trim($mc);
          if ($mc === '' || $mc[0] === '#' || preg_match('#^(?:javascript|mailto|tel|data):#i', $mc))
              continue;
          if (strpos($mc, '//') === 0)
              $mc = $scheme.':'.$mc;                  // protocol-relative link
          elseif (!preg_match('#^[a-z][a-z0-9+.\-]*://#i', $mc))
              $mc = $base.'/'.ltrim($mc, '/');        // relative link
          if (!in_array($mc, $links, true))
              $links[] = $mc;
      }
      return $links;
  }

  /**
   * Remember $url in $outputArr and append it to the output file, once per URL.
   *
   * @param string $url Address to save.
   * @return bool True when the URL was newly saved, false when it was already known.
   */
  function work_record_url($url)
  {
      global $outputArr, $output;

      $key = md5($url);
      if (isset($outputArr[$key]))
          return false;
      $outputArr[$key] = $url;
      file_put_contents($output, $url."\n", FILE_APPEND);
      echo 'url: ',$url,' find ajax require so save it',PHP_EOL;
      return true;
  }
  /**
   * Fetch $url with cURL and return the raw response (headers + body).
   *
   * The request follows redirects, sends the optional global $header lines and
   * goes through the optional global $proxy. A transfer error, a final status
   * other than 200/302, or a response mentioning "114so.cn" counts as failure:
   * the problem is printed and, when SAVE_ERROR is on, the URL is appended to
   * error.txt next to this script.
   *
   * @param string $url Address to fetch; "http://" is prepended when no scheme is present.
   * @return string|false Response text on success, false on failure.
   */
  function check_valid_url($url)
  {
      // Without this declaration the header/proxy settings are never visible here.
      global $header, $proxy;

      if (!preg_match('#^[a-z][a-z0-9+.\-]*://#i', $url))
          $url = 'http://'.$url;

      $ch = curl_init();
      curl_setopt($ch, CURLOPT_URL, $url);
      curl_setopt($ch, CURLOPT_HEADER, true);
      curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
      // Return the response instead of printing it.
      curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
      curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)');
      if (is_array($header) && !empty($header))
          curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
      if (!empty($proxy))
          curl_setopt($ch, CURLOPT_PROXY, $proxy);

      $ret = curl_exec($ch);
      $errinfo = curl_error($ch);
      // Read the numeric status: matching the text "200 OK" fails on HTTP/2 responses.
      $status = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
      curl_close($ch);
      unset($ch);

      $badStatus = ($status !== 200 && $status !== 302);
      if ($ret === false || !empty($errinfo) || $badStatus || strpos($ret, '114so.cn') !== false)
      {
          showerror('check url: '.$url.' find some errors', array($errinfo, 'http status: '.$status));
          if (defined('SAVE_ERROR') && SAVE_ERROR)
              file_put_contents(dirname(__FILE__).'/error.txt', $url."\n", FILE_APPEND);
          return false;
      }
      return $ret;
  }
  /**
   * Tell whether the cURL extension needed by check_valid_url() is usable.
   *
   * @return bool True when curl_init() exists and the "curl" extension is loaded.
   */
  function check_extension()
  {
      return function_exists('curl_init') && extension_loaded('curl');
  }
  /**
   * Print a message framed by two lines of "#" characters.
   *
   * @param string       $t Title, printed on the first line inside the frame.
   * @param string|array $c Detail text: one string or a list of lines; any
   *                        other value prints the title only.
   * @return void
   */
  function showerror($t, $c)
  {
      $border = str_repeat('#', 78);

      $str = $border."\n";
      $str .= "# ".$t."\n";
      if (is_string($c))
          $str .= "# ".$c."\n";
      elseif (is_array($c) && !empty($c))
      {
          foreach ($c as $c1)
              $str .= "# ".$c1."\n";
      }
      $str .= $border."\n";

      echo $str;
  }
複製程式碼


陳述:
本文內容由網友自願投稿,版權歸原作者所有。本站不承擔相應的法律責任。如發現涉嫌抄襲或侵權的內容,請聯絡admin@php.cn