search

Home  >  Q&A  >  body text

Python:使用scrapy框架的ImagesPipeline下载图片如何保持原文件名呢?

仔细看了下scrapy的官方文档关于ImagesPipeline的介绍说明及使用例子:Downloading Item Images
感觉官方文档太过简单。
比如:在settings.py文件中给IMAGES_STORE赋值,就可以指定图片的保存路径。
并且默认情况下,文件名是通过对url使用SHA1 hash得来的。
现在我想以原来的图片名进行保存,不知道该如何做,希望有经验的朋友帮忙指点下~

另外求大家帮忙推荐下关于scrapy的相关书籍(中英文都行)

天蓬老师天蓬老师2804 days ago956

reply all(3)I'll reply

  • ringa_lee

    ringa_lee2017-04-17 12:04:50

    Looking at the source code of ImagesPipeline, you can see that overriding the file_path method lets you change the image name, for example:

    def file_path(self, request, response=None, info=None):
        """Return the storage path for an image, keeping its original file name.

        The last ``/``-separated segment of ``request.url`` is used as the
        file name, so the result is ``full/<original name>`` instead of a
        hash-derived name. As a side effect, every requested URL is appended
        as one line to ``image_urls.txt`` in the current working directory.

        ``response`` and ``info`` are accepted for signature compatibility
        and are not used here.
        """
        # A context manager closes (and flushes) the log file deterministically
        # rather than leaving the open handle to the garbage collector.
        with open("image_urls.txt", "a") as url_log:
            url_log.write(request.url + "\n")
        image_guid = request.url.split('/')[-1]
        return 'full/%s' % (image_guid)
    

    The source code of ImagesPipeline is as follows:

    class ImagesPipeline(FilesPipeline):
        """Abstract pipeline that implements the image thumbnail generation logic.

        Each downloaded image is re-encoded as JPEG and persisted through
        ``self.store``. By default the full-size copy is stored as
        ``full/<sha1 of url>.jpg`` and one extra copy per entry in ``THUMBS``
        is stored as ``thumbs/<thumb_id>/<sha1 of url>.jpg``.
        """
    
        MEDIA_NAME = 'image'
        # Minimum accepted dimensions; get_images() raises for smaller images.
        MIN_WIDTH = 0
        MIN_HEIGHT = 0
        # Maps thumb_id -> size; get_images() yields one thumbnail per entry.
        THUMBS = {}
        # Item field names used when the settings do not override them.
        DEFAULT_IMAGES_URLS_FIELD = 'image_urls'
        DEFAULT_IMAGES_RESULT_FIELD = 'images'
    
        @classmethod
        def from_settings(cls, settings):
            """Configure the pipeline class from ``settings`` and instantiate it.

            The values are assigned on ``cls`` itself, so they are shared by
            the class rather than stored per instance. Returns
            ``cls(settings['IMAGES_STORE'])``.
            """
            cls.MIN_WIDTH = settings.getint('IMAGES_MIN_WIDTH', 0)
            cls.MIN_HEIGHT = settings.getint('IMAGES_MIN_HEIGHT', 0)
            # NOTE(review): unit of IMAGES_EXPIRES is not visible here
            # (presumably days) -- confirm against FilesPipeline.
            cls.EXPIRES = settings.getint('IMAGES_EXPIRES', 90)
            cls.THUMBS = settings.get('IMAGES_THUMBS', {})
            # Copy the AWS credentials onto the store registered for the 's3'
            # scheme (STORE_SCHEMES is not defined in this class; presumably
            # inherited from FilesPipeline).
            s3store = cls.STORE_SCHEMES['s3']
            s3store.AWS_ACCESS_KEY_ID = settings['AWS_ACCESS_KEY_ID']
            s3store.AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY']
    
            cls.IMAGES_URLS_FIELD = settings.get('IMAGES_URLS_FIELD', cls.DEFAULT_IMAGES_URLS_FIELD)
            cls.IMAGES_RESULT_FIELD = settings.get('IMAGES_RESULT_FIELD', cls.DEFAULT_IMAGES_RESULT_FIELD)
            store_uri = settings['IMAGES_STORE']
            return cls(store_uri)
    
        def file_downloaded(self, response, request, info):
            """Delegate to image_downloaded() and return its result."""
            return self.image_downloaded(response, request, info)
    
        def image_downloaded(self, response, request, info):
            """Persist every image produced by get_images() and return a checksum.

            The checksum is computed only from the first buffer yielded by
            get_images() (the full-size image); thumbnails are stored but do
            not affect the returned value.
            """
            checksum = None
            for path, image, buf in self.get_images(response, request, info):
                if checksum is None:
                    # convert_image() leaves the buffer positioned after the
                    # written data, so rewind before hashing it.
                    buf.seek(0)
                    checksum = md5sum(buf)
                width, height = image.size
                # Everything is stored with a JPEG content type because
                # convert_image() always saves as 'JPEG'.
                self.store.persist_file(
                    path, buf, info,
                    meta={'width': width, 'height': height},
                    headers={'Content-Type': 'image/jpeg'})
            return checksum
    
        def get_images(self, response, request, info):
            """Yield ``(path, image, buffer)`` for the full image, then each thumbnail.

            Raises ImageException when the downloaded image is narrower than
            MIN_WIDTH or shorter than MIN_HEIGHT.
            """
            path = self.file_path(request, response=response, info=info)
            orig_image = Image.open(StringIO(response.body))
    
            width, height = orig_image.size
            if width < self.MIN_WIDTH or height < self.MIN_HEIGHT:
                raise ImageException("Image too small (%dx%d < %dx%d)" %
                                     (width, height, self.MIN_WIDTH, self.MIN_HEIGHT))
    
            image, buf = self.convert_image(orig_image)
            yield path, image, buf
    
            # dict.iteritems() exists only on Python 2.
            for thumb_id, size in self.THUMBS.iteritems():
                thumb_path = self.thumb_path(request, thumb_id, response=response, info=info)
                # Thumbnails are derived from the already converted image, not
                # from orig_image.
                thumb_image, thumb_buf = self.convert_image(image, size)
                yield thumb_path, thumb_image, thumb_buf
    
        def convert_image(self, image, size=None):
            """Return ``(image, buffer)`` with the image converted to RGB JPEG data.

            When ``size`` is given, a copy of the image is shrunk with
            ``thumbnail(size, Image.ANTIALIAS)`` before being saved. The
            returned buffer is not rewound after the save.
            """
            if image.format == 'PNG' and image.mode == 'RGBA':
                # Paste the PNG onto a background filled with (255, 255, 255)
                # before dropping to RGB; passing the image again as the second
                # argument presumably uses it as the paste mask -- confirm
                # against the PIL paste() API.
                background = Image.new('RGBA', image.size, (255, 255, 255))
                background.paste(image, image)
                image = background.convert('RGB')
            elif image.mode != 'RGB':
                image = image.convert('RGB')
    
            if size:
                # Work on a copy so the caller's image object is not resized.
                image = image.copy()
                image.thumbnail(size, Image.ANTIALIAS)
    
            buf = StringIO()
            image.save(buf, 'JPEG')
            return image, buf
    
        def get_media_requests(self, item, info):
            """Return one Request per URL in the item's IMAGES_URLS_FIELD (or [])."""
            return [Request(x) for x in item.get(self.IMAGES_URLS_FIELD, [])]
    
        def item_completed(self, results, item, info):
            """Store the successful results on the item and return the item.

            ``results`` is iterated as ``(ok, x)`` pairs; only the ``x`` of
            pairs with a truthy ``ok`` are kept. Nothing is written when
            IMAGES_RESULT_FIELD is not among ``item.fields``.
            """
            if self.IMAGES_RESULT_FIELD in item.fields:
                item[self.IMAGES_RESULT_FIELD] = [x for ok, x in results if ok]
            return item
    
        def file_path(self, request, response=None, info=None):
            """Return the storage path of the full-size image for ``request``.

            ``request`` may be a Request or, for the deprecated key methods, a
            plain URL. If file_key() or image_key() has been overridden, the
            override is used (file_key() is checked first); otherwise the
            result is ``'full/<sha1 hex digest of url>.jpg'``.
            """
            ## start of deprecation warning block (can be removed in the future)
            def _warn():
                # Imports are local to this helper, so they only run when a
                # deprecated code path is actually taken.
                from scrapy.exceptions import ScrapyDeprecationWarning
                import warnings
                warnings.warn('ImagesPipeline.image_key(url) and file_key(url) methods are deprecated, '
                              'please use file_path(request, response=None, info=None) instead',
                              category=ScrapyDeprecationWarning, stacklevel=1)
    
            # check if called from image_key or file_key with url as first argument
            if not isinstance(request, Request):
                _warn()
                url = request
            else:
                url = request.url
    
            # detect if file_key() or image_key() methods have been overridden
            # (the base implementations carry a `_base` attribute, set below)
            if not hasattr(self.file_key, '_base'):
                _warn()
                return self.file_key(url)
            elif not hasattr(self.image_key, '_base'):
                _warn()
                return self.image_key(url)
            ## end of deprecation warning block
    
            image_guid = hashlib.sha1(url).hexdigest()  # change to request.url after deprecation
            return 'full/%s.jpg' % (image_guid)
    
        def thumb_path(self, request, thumb_id, response=None, info=None):
            """Return the storage path of the ``thumb_id`` thumbnail for ``request``.

            ``request`` may be a Request or, for the deprecated thumb_key(), a
            plain URL. If thumb_key() has been overridden, the override is
            used; otherwise the result is
            ``'thumbs/<thumb_id>/<sha1 hex digest of url>.jpg'``.
            """
            ## start of deprecation warning block (can be removed in the future)
            def _warn():
                # Imports are local to this helper, so they only run when a
                # deprecated code path is actually taken.
                from scrapy.exceptions import ScrapyDeprecationWarning
                import warnings
                warnings.warn('ImagesPipeline.thumb_key(url) method is deprecated, please use '
                              'thumb_path(request, thumb_id, response=None, info=None) instead',
                              category=ScrapyDeprecationWarning, stacklevel=1)
    
            # check if called from thumb_key with url as first argument
            if not isinstance(request, Request):
                _warn()
                url = request
            else:
                url = request.url
    
            # detect if thumb_key() method has been overridden
            # (the base implementation carries a `_base` attribute, set below)
            if not hasattr(self.thumb_key, '_base'):
                _warn()
                return self.thumb_key(url, thumb_id)
            ## end of deprecation warning block
    
            thumb_guid = hashlib.sha1(url).hexdigest()  # change to request.url after deprecation
            return 'thumbs/%s/%s.jpg' % (thumb_id, thumb_guid)
    
        # deprecated: forwards to image_key(); the `_base` marker lets
        # file_path() tell this base implementation from a subclass override.
        def file_key(self, url):
            return self.image_key(url)
        file_key._base = True
    
        # deprecated: forwards to file_path() with a bare URL; `_base` marks the
        # base implementation for the override check in file_path().
        def image_key(self, url):
            return self.file_path(url)
        image_key._base = True
    
        # deprecated: forwards to thumb_path() with a bare URL; `_base` marks the
        # base implementation for the override check in thumb_path().
        def thumb_key(self, url, thumb_id):
            return self.thumb_path(url, thumb_id)
        thumb_key._base = True
    
    

    reply
    0
  • PHPz

    PHPz2017-04-17 12:04:50

    Now I want to save the picture with the original name

    It seems that there are no direct parameters to pass, so you need to implement the Images Pipeline yourself

    The get_media_requests(item, info) method of scrapy.contrib.pipeline.images.ImagesPipeline returns the requests for the images to download; once the downloads finish, the results are passed to the item_completed() method. results is a list of 2-tuples, (success, image_info_or_failure), where success is a bool indicating whether the download succeeded, and image_info_or_failure is a dict with three keys (url, path and checksum) or a Failure if the download failed. Here path is the storage path (including the file name) relative to IMAGES_STORE.

    [(True,
      {'checksum': '2b00042f7481c7b056c4b410d28f33cf',
       'path': 'full/7d97e98f8af710c7e7fe703abc8f639e0ee507c4.jpg',
       'url': 'http://www.example.com/images/product1.jpg'}),
     (True,
      {'checksum': 'b9628c4ab9b595f72f280b90c4fd093d',
       'path': 'full/1ca5879492b8fd606df1964ea3c1e2f4520f076f.jpg',
       'url': 'http://www.example.com/images/product2.jpg'}),
     (False,
      Failure(...))]
    

    The above is an example from the official website.

    So what you need to do is override the item_completed(results, item, info) method and replace the paths in item['image_paths'] with the original file names.

    reply
    0
  • PHP中文网

    PHP中文网2017-04-17 12:04:50

    Modifying file_path is too intrusive to the original code. If it is just to modify the file path, you can rename the file in item_completed.

    class NeteaseautoImagePipeline(ImagesPipeline):
        """Image pipeline that renames the first downloaded image after the item.

        The image is fetched at 800x600 instead of 120x90 and, once stored,
        moved to ``/neteaseauto/<car>[-<jk>]-<model>.jpg``.
        """

        def get_media_requests(self, item, info):
            """Yield one request per image URL, swapping in the 800x600 variant."""
            for thumb_url in item['image_urls']:
                large_url = thumb_url.replace('120x90', '800x600')
                yield scrapy.Request(large_url)

        def item_completed(self, results, item, info):
            """Rename the first stored image using the item's fields.

            Raises DropItem when no download succeeded. Returns the item
            unchanged otherwise.
            """
            stored_paths = [entry['path'] for success, entry in results if success]
            if not stored_paths:
                raise DropItem("Item contains no images")

            # The 'jk' part is only included in the file name when non-empty.
            variant = item['jk']
            if variant:
                stem = item['car'] + '-' + variant + '-' + item['model']
            else:
                stem = item['car'] + '-' + item['model']
            newname = stem + '.jpg'

            source = "/neteaseauto/" + stored_paths[0]
            target = "/neteaseauto/" + newname
            os.rename(source, target)
            return item

    reply
    0
  • Cancelreply