首页  >  文章  >  后端开发  >  php正则抓取整个域名下的图片

php正则抓取整个域名下的图片

WBOY
WBOY原创
2016-07-25 08:50:05 · 995 浏览
代码出处:jUnion

适用平台:Windows, Linux(Ubuntu),php-5.2.5+,Apache

功能:抓取整个站点的图片;目前尚未借助 PHP 的 curl 扩展实现,后期再完善

配置:config目录下
domain_name:域名(默认:bizhibar.com)
request_site:网站网址(默认:http://www.bizhibar.com/)
request_url:从网站的哪个页面开始(默认:http://www.bizhibar.com/)
accept_type: 图片类型(默认:gif, bmp, png, ico, jpg, jpeg)
save_path:图片保存路径(默认:savefiles/)
partition_name:图片保存目录名称前缀(默认:img_)
dir_file_limit: 每个目录容许多少个文件(默认:100)
serialize_img_size: 当读取了多少个图片地址才缓存到cache目录下的accompImg文件当中,下次继续抓取的时候会忽略这些地址。(默认:30)
serialize_url_size:与serialize_img_size类似,当读取了多少个链接地址才缓存到cache目录下的overURL文件当中,下次继续抓取的时候会忽略这些地址。(默认:10)

说明:欢迎诸君批评指教,有任何新问题或者需要改进的地方,请您反馈给我
  // Entry script: lifts the execution time limit, loads the crawler
  // sources, then starts crawling from the configured start page.
  set_time_limit( 0 );

  // Loaded first; presumably this is where __Home__ and __Os__ (used on
  // every following line) are defined -- TODO confirm against the file.
  $_includeDir = dirname( __FILE__ ) . DIRECTORY_SEPARATOR . 'include' . DIRECTORY_SEPARATOR;
  require $_includeDir . 'Capture.const.php';
  require __Home__ . 'include' . __Os__ . 'Capture.class.php';

  // Locations of the two configuration files and the two cache files
  // that are handed to the Capture constructor.
  $_configDir = __Home__ . 'config' . __Os__;
  $_cacheDir  = __Home__ . 'cache' . __Os__;

  $_cfg              = array();
  $_cfg['site']      = $_configDir . 'capture.site.php';
  $_cfg['preg']      = $_configDir . 'capture.preg.php';
  $_cfg['accompImg'] = $_cacheDir . 'accompImg';
  $_cfg['overURL']   = $_cacheDir . 'overURL';

  // Capture takes its configuration by reference, so it must be given a
  // variable rather than an inline array expression.
  $_capture = new Capture( $_cfg );
  $_capture->parseQuestUrl();
  ?>
复制代码
  /**
   * The main class: walks the links of one site and hands every fetched
   * page to Pivotal for parsing.
   *
   * State is kept in static properties, so it is shared by all instances.
   *
   * @author pankai
   * @date 2013-08-10
   */
  class Capture {
      /** @var array paths passed to the constructor (keys: site, preg, accompImg, overURL) */
      private static $_Config = array();

      /** @var array|null value returned by the 'site' configuration file */
      private static $_CapSite = NULL;

      /** @var array|null value returned by the 'preg' configuration file, after placeholder substitution */
      private static $_CapPreg = NULL;

      /** @var array URLs that have already been queued for fetching */
      private static $_overURL = array();

      /** @var bool FALSE while getCapInstance() is running, TRUE once it has returned */
      private $_mark = FALSE;

      /** @var int multiplier applied to the configured per-page sleep time */
      private static $_markTime = 1;

      /**
       * Initialize the main class: load both configuration files, substitute
       * the request site into the patterns and restore the cached progress.
       *
       * @param array $_cfg file paths keyed by 'site', 'preg', 'accompImg' and 'overURL'
       */
      public function __construct( &$_cfg ) {
          self::$_Config  = &$_cfg;
          self::$_CapSite = require $_cfg['site'];
          self::$_CapPreg = require $_cfg['preg'];

          // Replace the '_request_site' placeholder in every pattern.
          foreach ( self::$_CapPreg as $_key => $_value ) {
              self::$_CapPreg[$_key] = str_replace( '_request_site', self::$_CapSite['request_site'], $_value );
          }

          self::import( 'file.OperateFile' );
          $_cached = self::readCache( $_cfg['overURL'] );
          // Only an array is usable here: the list is later searched with
          // in_array() and appended to with [].
          if ( is_array( $_cached ) ) {
              self::$_overURL = $_cached;
          }

          self::import( 'pivotal.Pivotal' );
          $_cached = self::readCache( $_cfg['accompImg'] );
          // A corrupt cache must not overwrite Pivotal's own initial value.
          if ( $_cached !== FALSE ) {
              Pivotal::$_accompImg = $_cached;
          }
      }

      /**
       * Read and unserialize one cache file.
       *
       * NOTE(review): unserialize() should only ever see files written by
       * this tool; keep the cache directory out of reach of untrusted users.
       *
       * @param string $_path cache file path
       * @return mixed the unserialized value, or FALSE when the file is
       *               missing, empty or cannot be unserialized
       */
      private static function readCache( $_path ) {
          if ( !file_exists( $_path ) ) {
              return FALSE;
          }
          $_size = filesize( $_path );
          if ( $_size === FALSE || $_size <= 0 ) {
              return FALSE;
          }
          return unserialize( OperateFile::readText( $_path, $_size ) );
      }

      /**
       * Load a class, following the Java programmer's package notation,
       * e.g. import com.jUnion.Capture
       *
       * @param string $_class dotted name; dots become directory separators
       *                       below the include directory
       */
      public static function import( $_class ) {
          require_once __Home__.'include'.__Os__.str_replace( '.', __Os__, $_class ).'.class.php';
      }

      /**
       * Create an instance of the Pivotal class for one page and parse it.
       *
       * @param mixed $_source page content as returned by Http::get()
       * @return mixed whatever Pivotal::parseUrl() returns
       */
      private function getCapInstance( &$_source ) {
          $this->_mark = FALSE;
          $_Captal     = new Pivotal( self::$_Config, $_source );
          $_tagA       = $_Captal->parseUrl();
          $this->_mark = TRUE;
          return $_tagA;
      }

      /**
       * Go forward one by one: visit every URL in a (possibly nested) list,
       * fetch and parse it, then recurse into the links found on that page.
       *
       * @param array|null $_tagArr URLs or nested arrays of URLs
       */
      private function roundTagA( &$_tagArr ) {
          if ( $_tagArr == NULL ) {
              return;
          }
          $_tagArrLength = count( $_tagArr );
          for ( $i = 0; $i < $_tagArrLength; $i++ ) {
              if ( is_array( $_tagArr[$i] ) ) {
                  $this->roundTagA( $_tagArr[$i] );
                  continue;
              }

              $_url = $_tagArr[$i];

              // Stay inside the configured domain.
              if ( stripos( $_url, self::$_CapSite['domain_name'] ) === FALSE ) {
                  continue;
              }
              // Skip URLs that were already queued, now or in an earlier run.
              if ( in_array( $_url, self::$_overURL, TRUE ) ) {
                  continue;
              }
              self::$_overURL[] = $_url;

              // Persist the visited list every serialize_url_size entries.
              if ( count( self::$_overURL ) % self::$_CapSite['serialize_url_size'] == 0 ) {
                  OperateFile::setText( self::$_Config['overURL'], serialize( self::$_overURL ) );
              }

              do {
                  // getCapInstance() takes its argument by reference, so the
                  // page must be held in a variable first.
                  $_page = Http::get( $_url );
                  $_tagA = $this->getCapInstance( $_page );
                  sleep( self::$_CapSite['preform_page_time'] * self::$_markTime );
                  if ( $this->_mark === TRUE ) {
                      // NOTE(review): the multiplier starts at 1 but is reset
                      // to preform_page_time here, so later sleeps use that
                      // value squared -- confirm whether 1 was intended.
                      self::$_markTime = self::$_CapSite['preform_page_time'];
                      break;
                  }
                  // NOTE(review): _mark is always TRUE once getCapInstance()
                  // has returned, so this back-off looks unreachable.
                  self::$_markTime *= 2;
              } while ( TRUE );

              /* parse the main page and return next page */
              $this->roundTagA( $_tagA );
          }
      }

      /**
       * Start the crawl from the configured request_url.
       */
      public function parseQuestUrl() {
          self::import( 'http.Http' );
          // Held in a variable because getCapInstance() takes a reference.
          $_page      = Http::get( self::$_CapSite['request_url'] );
          $_round_Arr = $this->getCapInstance( $_page );
          $this->roundTagA( $_round_Arr );
      }
  }
  ?>
复制代码


声明:
本文内容由网友自发贡献,版权归原作者所有,本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容,请联系admin@php.cn