Home  >  Article  >  Backend Development  >  URL crawler

URL crawler

WBOY
WBOYOriginal
2016-07-25 08:48:11 · 1240 views
If you need points-free download of csdn, point-free download of pudn, or point-free 51cto, please go to http://www.itziy.com/
Run it from the command line; invoking it without the required arguments prints the usage information.
Function description
1. Support agent
2. Supports setting the number of recursive checks
3. Supports output type control and check content control

Function:
Its main purpose is to replace manual inspection by automatically capturing as many candidate request packets and URL addresses as possible, to assist penetration testing.
  1. error_reporting(E_ERROR | E_WARNING | E_PARSE);
  2. ini_set('memory_limit','1024M');
  3. set_time_limit(0);
  4. define('CHECK_A_TAG', false);
  5. define('CHECK_JS_TAG', true);
  6. define('CHECK_URL', true);
  7. define('SAVE_ERROR', true);
  8. $checkArr = array(
  9. '$.load',
  10. '.ajax',
  11. '$.post',
  12. '$.get',
  13. '.getJSON'
  14. );
  15. if ($argc < 2)
  16. die(showerror('sorry, parameter error', array('example: php debug.php url num filename header proxy', 'detail information:', 'url: target url address which you want to check it', 'num: The number of pages of recursive,default 3', 'filename: output filename default name ret.txt', 'header: The request header file default null', 'proxy: if you want to use proxy set it here default no use proxy')));
  17. if (!check_extension())
  18. die(showerror('extension curl not support', 'please open php curl extension support'));
  19. //global variable
  20. $url = trim($argv[1]);
  21. if (stripos($url, 'http') === false)
  22. $url = 'http://'.$url;
  23. $num = isset($argv[2]) ? intval($argv[2]) : 3;
  24. $output = isset($argv[3]) ? trim(str_replace("\", '/', $argv[3])) : str_replace("\", '/', dirname(__FILE__)).'/ret.txt';
  25. $header = null;
  26. $proxy = null;
  27. $host = null;
  28. if (isset($argv[4]))
  29. {
  30. $header = trim(str_replace("\", '/', $argv[4]));
  31. if (file_exists($header))
  32. $header = array_filter(explode("n", str_replace("r", '', file_get_contents($header))));
  33. else
  34. {
  35. $file = str_replace("\", '/', dirname(__FILE__)).'/'.$header;
  36. if (file_exists($file))
  37. $header = array_filter(explode("n", str_replace("r", '', file_get_contents($file))));
  38. else
  39. $header = null;
  40. }
  41. }
  42. if (isset($argv[5]))
  43. $proxy = trim($argv[5]);
  44. if (!is_array($header) || empty($header))
  45. $header = null;
  46. $result = check_valid_url($url);
  47. $outputArr = array();
  48. if (!empty($result))
  49. {
  50. $result = str_replace("r", '', $result);
  51. $result = str_replace("n", '', $result);
  52. $tmpArr = parse_url($url);
  53. if (!isset($tmpArr['host']))
  54. die(showerror('parse url error', 'can not get host form url: '.$url));
  55. $host = $tmpArr['host'];
  56. if (stripos($host, 'http') === false)
  57. $host = 'http://'.$host;
  58. unset($tmpArr);
  59. //check for current page
  60. if (!isset($outputArr[md5($url)]))
  61. {
  62. $outputArr[md5($url)] = $url;
  63. file_put_contents($output, $url."n", FILE_APPEND);
  64. echo 'url: ',$url,' find ajax require so save it',PHP_EOL;
  65. }
  66. work($result);
  67. }
  68. echo 'run finish',PHP_EOL;
  69. function work($result, $reverse = false)
  70. {
  71. global $num, $host, $outputArr, $checkArr, $output;
  72. if (!$result)
  73. return;
  74. $result = str_replace("r", '', $result);
  75. $result = str_replace("n", '', $result);
  76. while ($num > 0)
  77. {
  78. echo 'remain: ',$num,' now start to check for url address',PHP_EOL,PHP_EOL;
  79. preg_match_all('//i', $result, $match);
  80. if (CHECK_A_TAG && isset($match[2]) && !empty($match[2]))
  81. {
  82. foreach ($match[2] as $mc)
  83. {
  84. $mc = trim($mc);
  85. if ($mc == '#')
  86. continue;
  87. if (stripos($mc, 'http') === false)
  88. $mc = $host.$mc;
  89. if (($ret = check_valid_url($mc)))
  90. {
  91. if (!isset($outputArr[md5($mc)]))
  92. {
  93. $outputArr[md5($mc)] = $mc;
  94. file_put_contents($output, $mc."n", FILE_APPEND);
  95. echo 'url: ',$mc,' find ajax require so save it',PHP_EOL;
  96. }
  97. }
  98. }
  99. }
  100. //check for page url
  101. echo 'remain: ',$num,' now start to check for page url',PHP_EOL,PHP_EOL;
  102. preg_match_all('/(https?|ftp|mms)://([A-z0-9]+[_-]?[A-z0-9]+.)*[A-z0-9]+-?[A-z0-9]+.[A-z]{2,}(/.*)*/?/i', $result, $match);
  103. if (CHECK_URL && isset($match[2]) && !empty($match[2]))
  104. {
  105. foreach ($match[2] as $mc)
  106. {
  107. $mc = trim($mc);
  108. if ($mc == '#')
  109. continue;
  110. if (stripos($mc, 'http') === false)
  111. $mc = $host.$mc;
  112. if (($ret = check_valid_url($mc)))
  113. {
  114. if (!isset($outputArr[md5($mc)]))
  115. {
  116. $outputArr[md5($mc)] = $mc;
  117. file_put_contents($output, $mc."n", FILE_APPEND);
  118. echo 'url: ',$mc,' find ajax require so save it',PHP_EOL;
  119. }
  120. }
  121. }
  122. }
  123. //check for javascript ajax require
  124. echo 'remain: ',$num,' now start to check for javascript ajax require',PHP_EOL,PHP_EOL;
  125. preg_match_all('//i', $result, $match);
  126. if (CHECK_JS_TAG && isset($match[2]) && !empty($match[2]))
  127. {
  128. foreach ($match[2] as $mc)
  129. {
  130. $mc = trim($mc);
  131. if ($mc == '#')
  132. continue;
  133. if (stripos($mc, 'http') === false)
  134. $mc = $host.$mc;
  135. if (($ret = check_valid_url($mc)))
  136. {
  137. //check for current page
  138. foreach ($checkArr as $ck)
  139. {
  140. if (!isset($outputArr[md5($mc)]) && strpos($ret, $ck) !== false)
  141. {
  142. $outputArr[md5($mc)] = $mc;
  143. file_put_contents($output, $mc."n", FILE_APPEND);
  144. echo 'url: ',$mc,' find ajax require so save it',PHP_EOL;
  145. break;
  146. }
  147. }
  148. }
  149. }
  150. }
  151. if ($reverse)
  152. return;
  153. //check for next page
  154. preg_match_all('//i', $result, $match);
  155. if (isset($match[2]) && !empty($match[2]))
  156. {
  157. echo 'check for next page, remain page counts: ',$num,PHP_EOL;
  158. foreach ($match[2] as $mc)
  159. {
  160. $mc = trim($mc);
  161. if ($mc == '#')
  162. continue;
  163. if (stripos($mc, 'http') === false)
  164. $mc = $host.$mc;
  165. echo 'check for next page: ',$mc,PHP_EOL;
  166. work(check_valid_url($mc), true);
  167. }
  168. }
  169. $num--;
  170. sleep(3);
  171. }
  172. }
  173. function check_valid_url($url)
  174. {
  175. if (stripos($url, 'http') === false)
  176. $url = 'http://'.$url;
  177. $ch = curl_init();
  178. curl_setopt($ch, CURLOPT_URL, $url);
  179. curl_setopt($ch, CURLOPT_HEADER, true);
  180. curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
  181. curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
  182. curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)');
  183. if (!is_null($header))
  184. curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
  185. if (!is_null($proxy))
  186. curl_setopt($ch, CURLOPT_PROXY, $proxy);
  187. $ret = curl_exec($ch);
  188. $errinfo = curl_error($ch);
  189. curl_close($ch);
  190. unset($ch);
  191. if (!empty($errinfo) || ((strpos($ret, '200 OK') === false) && (strpos($ret, '302 Moved') === false)) || strpos($ret, '114so.cn') !== false)
  192. {
  193. showerror('check url: '.$url. ' find some errors', array($errinfo, $ret));
  194. if (SAVE_ERROR)
  195. file_put_contents(dirname(__FILE__).'/error.txt', $url."n", FILE_APPEND);
  196. return false;
  197. }
  198. return $ret;
  199. }
  200. function check_extension()
  201. {
  202. if (!function_exists('curl_init') || !extension_loaded('curl'))
  203. return false;
  204. return true;
  205. }
  206. function showerror($t, $c)
  207. {
  208. $str = "#########################################################################n";
  209. $str .= "# ".$t."n";
  210. if (is_string($c))
  211. $str .= "# ".$c;
  212. elseif (is_array($c) && !empty($c))
  213. {
  214. foreach ($c as $c1)
  215. $str .= "# ".$c1."n";
  216. }
  217. $str .= "n#########################################################################n";
  218. echo $str;
  219. unset($str);
  220. }
Copy code


Statement:
The content of this article is voluntarily contributed by netizens, and the copyright belongs to the original author. This site does not assume corresponding legal responsibility. If you find any content suspected of plagiarism or infringement, please contact admin@php.cn