Web page capture

WBOY
WBOY · Original
2016-07-25 08:49:44 · 1179 views
  <?php
  // Declare the response as UTF-8 HTML before any progress output is echoed.
  header('Content-Type:text/html;charset=utf-8');
  /**
   * A class for grabbing pictures.
   *
   * Starting from one page, it downloads every <img> it finds, then follows
   * the page's <a href> links and repeats. Progress is echoed as HTML.
   *
   * Note: the crawl is not restricted to the starting host; absolute links to
   * other sites are followed as well.
   *
   * @author tangpan
   */
  class download_image {
      /**
       * Patterns for <a href="..."> and <img src="..."> targets.
       * The captured URL stops at a space, quote or "?", so links that carry a
       * query string do not match and are ignored.
       */
      const LINK_PATTERN = "|<a[^>]+href=['\" ]?([^ '\"?]+)['\" >]|Ui";
      const IMG_PATTERN  = "|<img[^>]+src=['\" ]?([^ '\"?]+)['\" >]|Ui";

      public $_save_path = NULL; // Prefix prepended to every saved file name (normally a directory ending in "/")
      public $_limit_size = NULL; // Images of this many bytes or more are not saved
      public static $_img_url_old = array(); // Absolute URLs of the images already saved
      public static $_a_page_url = array(); // Absolute URLs of the pages already crawled

      /**
       * @param string $_save_path  Prefix for saved files, e.g. "images/".
       * @param int    $_limit_size Exclusive upper bound on the image size, in bytes.
       */
      public function __construct( $_save_path, $_limit_size ) {
          $this->_save_path = $_save_path;
          $this->_limit_size = $_limit_size;
      }

      /**
       * Crawl $site_url and every page reachable from it, saving the images found.
       *
       * Pages are processed from a work queue rather than by recursion, so a long
       * chain of links cannot exhaust the call stack. Each page is fetched once.
       *
       * @param string $site_url Absolute URL of the page to start from.
       * @return false|null false when $site_url is empty, otherwise nothing.
       */
      public function get_all_page_image( $site_url ) {
          if ( ! is_string( $site_url ) || $site_url === '' ) {
              return false;
          }

          $pending = array( $site_url );
          while ( ! empty( $pending ) ) {
              $page_url = array_shift( $pending );
              if ( in_array( $page_url, self::$_a_page_url, true ) ) {
                  continue; // Already crawled
              }
              self::$_a_page_url[] = $page_url;

              $content = $this->fetch_content( $page_url );
              $this->save_images_in_content( $page_url, $content );

              foreach ( $this->find_urls( self::LINK_PATTERN, $content ) as $href ) {
                  $href = trim( $href );
                  // Links that only point back at the current page or the home page are skipped.
                  if ( in_array( $href, array( '', '#', '/', $page_url ), true ) ) {
                      continue;
                  }
                  $next_url = $this->resolve_url( $page_url, $href );
                  if ( $next_url !== null && ! in_array( $next_url, self::$_a_page_url, true ) ) {
                      $pending[] = $next_url;
                  }
              }
          }
      }

      /**
       * Download all image links under the current page.
       *
       * @param string $site_url Absolute URL of the page to scan.
       */
      public function download_the_page_image( $site_url ) {
          $this->save_images_in_content( $site_url, $this->fetch_content( $site_url ) );
      }

      /**
       * Download one image and store it under the save path.
       *
       * The stored name is md5(microtime()) followed by the image's base name, so
       * two images with the same name do not overwrite each other.
       *
       * @param string $site_url Absolute URL of the page the image was found on.
       * @param string $img_url  Image address as written in the page (absolute or relative).
       */
      public function save_one_image( $site_url, $img_url ) {
          $resolved = $this->resolve_url( $site_url, $img_url );
          if ( $resolved === null ) {
              // Not an http(s) resource (e.g. "data:" URI) or the page URL could not be parsed.
              echo $this->escape( $img_url ) . 'Picture saving failed!<br />';
              return;
          }
          $img_url = $resolved;

          if ( in_array( $img_url, self::$_img_url_old, true ) ) {
              echo $this->escape( $img_url ) . 'This image has been captured!<br />';
              return;
          }

          // Get the image content and write it into a string
          $img_data = @file_get_contents( $img_url );
          if ( $img_data === false ) {
              echo $this->escape( $img_url ) . 'Picture saving failed!<br />';
              return;
          }
          if ( strlen( $img_data ) >= $this->_limit_size ) {
              echo $this->escape( $img_url ) . 'The image size exceeds the limit!<br />';
              return;
          }

          $target = $this->_save_path . md5( microtime() ) . basename( $img_url );
          // A zero-byte write counts as a failure, so empty responses are not recorded as saved.
          if ( @file_put_contents( $target, $img_data ) ) {
              echo $this->escape( $img_url ) . 'The picture was saved successfully!<br />';
              self::$_img_url_old[] = $img_url;
          } else {
              echo $this->escape( $img_url ) . 'Picture saving failed!<br />';
          }
      }

      /** Fetch a URL; a failed request yields an empty string. */
      private function fetch_content( $url ) {
          $content = @file_get_contents( $url );
          return $content === false ? '' : $content;
      }

      /** Return the first capture group of every match of $pattern in $content. */
      private function find_urls( $pattern, $content ) {
          $matches = array();
          if ( ! preg_match_all( $pattern, $content, $matches ) ) {
              return array();
          }
          return $matches[1];
      }

      /** Report how many images $content references and save each of them. */
      private function save_images_in_content( $site_url, $content ) {
          $img_urls = $this->find_urls( self::IMG_PATTERN, $content );
          echo '<h1>' . $this->escape( $site_url ) . 'Total found' . count( $img_urls ) . 'Pictures</h1>';
          foreach ( $img_urls as $img_url ) {
              $this->save_one_image( $site_url, $img_url );
          }
      }

      /**
       * Turn a link found on $base_url into an absolute http(s) URL.
       *
       * Handles absolute, protocol-relative ("//host/x"), root-relative ("/x") and
       * document-relative ("x") links. "." and ".." segments are left as written.
       *
       * @return string|null The absolute URL, or null when the link has another
       *                     scheme (mailto:, javascript:, data:, ...), is empty
       *                     after removing its fragment, or $base_url is unusable.
       */
      private function resolve_url( $base_url, $link ) {
          $link = trim( $link );
          $hash = strpos( $link, '#' );
          if ( $hash !== false ) {
              $link = substr( $link, 0, $hash ); // The fragment never reaches the server
          }
          if ( $link === '' ) {
              return null;
          }
          if ( preg_match( '#^https?://#i', $link ) ) {
              return $link;
          }

          $base = parse_url( $base_url );
          if ( $base === false || empty( $base['scheme'] ) || empty( $base['host'] ) ) {
              return null;
          }
          if ( strncmp( $link, '//', 2 ) === 0 ) {
              return $base['scheme'] . ':' . $link;
          }
          if ( preg_match( '#^[a-z][a-z0-9+.\-]*:#i', $link ) ) {
              return null;
          }

          $origin = $base['scheme'] . '://' . $base['host'];
          if ( isset( $base['port'] ) ) {
              $origin .= ':' . $base['port'];
          }
          if ( $link[0] === '/' ) {
              return $origin . $link;
          }

          // Document-relative: resolve against the directory of the base path.
          $path = isset( $base['path'] ) ? $base['path'] : '/';
          $slash = strrpos( $path, '/' );
          $dir = $slash === false ? '/' : substr( $path, 0, $slash + 1 );
          return $origin . $dir . $link;
      }

      /** Escape text taken from crawled pages before echoing it into the HTML report. */
      private function escape( $text ) {
          return htmlspecialchars( (string) $text, ENT_QUOTES, 'UTF-8' );
      }
  }
  // A full crawl can run far longer than the default execution limit, so lift it.
  set_time_limit(0);
  // Save into surces_Img/ and skip any image of 100 MB (1024*1024*100 bytes) or more.
  $download_images = new download_image('surces_Img/', 1024 * 1024 * 100);
  $download_images->get_all_page_image('http://www.22mm.cc/');
  ?>
Copy code


Statement:
The content of this article is voluntarily contributed by netizens, and the copyright belongs to the original author. This site does not assume corresponding legal responsibility. If you find any content suspected of plagiarism or infringement, please contact admin@php.cn