search

Home  >  Q&A  >  body text

Crawler pictures - Please tell me: python crawler encoding problem, version 3.6, win10 64-bit?

This is the error message:

Traceback (most recent call last):
  File "D:\py\pic_downfrom2255ok.py", line 45, in <module>
    html = getHtml(url_all[i])
  File "D:\py\pic_downfrom2255ok.py", line 32, in getHtml
    html = response.read().decode()
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb3 in position 184: invalid start byte

I have already tried changing the code in many places. The most likely cause is that the target website is encoded in gb2312.
This program downloads pictures normally from other websites, but it fails on the current website.
Please advise: where is the problem? I have tried several methods, but none of them worked.
The source code is as follows:

#coding=utf-8
import urllib.request
from urllib.request import urlopen, urlretrieve 
import urllib
import urllib.parse
import re
import os
from bs4 import BeautifulSoup


# Product pages to scrape. The loop below uses each URL's file name without
# its extension (e.g. "2603") as the name of the output folder.
url_all =[
'http://www.shop2255.com/showpro/2603.html',
'http://www.shop2255.com/showpro/1558.html',
'http://www.shop2255.com/showpro/1564.html',
'http://www.shop2255.com/showpro/2411.html',
'http://www.shop2255.com/showpro/2409.html',
'http://www.shop2255.com/showpro/1561.html',
'http://www.shop2255.com/showpro/2414.html',
'http://www.shop2255.com/showpro/2609.html',
'http://www.shop2255.com/showpro/2413.html',
'http://www.shop2255.com/showpro/2604.html',
'http://www.shop2255.com/showpro/2605.html',
'http://www.shop2255.com/showpro/2606.html',
'http://www.shop2255.com/showpro/2608.html',
'http://www.shop2255.com/showpro/2607.html',
'http://www.shop2255.com/showpro/2610.html']

def getHtml(url):
    """Fetch *url* and return the response body decoded as GBK text.

    The response is opened in a ``with`` block so that it is closed even if
    reading fails.

    Args:
        url: Address of the page to download.

    Returns:
        The page content as ``str``.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        UnicodeDecodeError: If the body is not valid GBK.
    """
    with urlopen(url) as response:
        raw = response.read()
    # GBK is a superset of gb2312, the encoding the target site is said to use.
    return raw.decode("gbk")


def getImg(html):
    """Return the value of every ``src="....jpg"`` attribute found in *html*.

    Args:
        html: Page source as ``str``.

    Returns:
        A list of the captured ``src`` values, in document order. The list is
        empty when nothing matches.
    """
    # Raw string: "\." in a normal literal is an invalid escape sequence.
    # [^"] keeps a match inside one attribute value; with "." a non-.jpg src
    # would be glued to the next .jpg src on the same line.
    return re.findall(r'src="([^"]+?\.jpg)"', html)

# For every page in url_all: download the page, collect its .jpg src values
# and try to save each image under a D:\ folder named after the page.
for i in range(len(url_all)):
    html = getHtml(url_all[i])
    # NOTE(review): getHtml() already returns a decoded str, and str has no
    # .decode() method in Python 3, so this line raises AttributeError.
    # NOTE(review): the name "list" shadows the builtin.
    list=getImg(html.decode())
    x = 0  # running image counter for this page; only printed
    for imgurl in list:
        print(x)
        # Folder name = page file name without extension,
        # e.g. ".../showpro/2603.html" -> "2603".
        file_path = url_all[i]
        (filepath,tempfilename) = os.path.split(file_path)
        (filename,extension) = os.path.splitext(tempfilename)
        
        if not os.path.exists('d:\%s' % filename):
            os.mkdir('d:\%s' % filename)
        # os.mkdir('D:\%s' % filename2)
        
        # NOTE(review): str has no method "splite" (typo for "split"), so this
        # line raises AttributeError. getImg() only returns values ending in
        # ".jpg", so the format string would also append a second ".jpg".
        local=r'D:\%s\%s.jpg' % (filename,imgurl.splite("/")[-1])
        # NOTE(review): imgurl is passed exactly as captured from the src
        # attribute; presumably a site-relative path that urlretrieve cannot
        # open on its own - confirm against the page source.
        urllib.request.urlretrieve(imgurl,local)
        x+=1
print("done")
伊谢尔伦伊谢尔伦2805 days ago1019

reply all(2)I'll reply

  • 天蓬老师

    天蓬老师2017-05-18 10:55:14

    # coding: utf-8
    
    import urllib
    import requests
    from pyquery import PyQuery as Q
    import os
    
    # Python 3 locations of the URL helpers used below (urllib.urlretrieve
    # exists only in Python 2; the question targets Python 3.6).
    from urllib.parse import urljoin
    from urllib.request import urlretrieve

    # Site root that relative <img src> values are resolved against.
    base_url = 'http://www.shop2255.com/'


    # Pages to scrape; each page's images go into a folder named after the
    # page file (e.g. "2603") under the current working directory.
    url_all =['http://www.shop2255.com/showpro/2603.html']


    for url in url_all:
        _, file_name = os.path.split(url)
        dir_name, _ = os.path.splitext(file_name)

        os.makedirs(dir_name, exist_ok=True)

        r = requests.get(url)
        for img in Q(r.text).find('img'):
            src = Q(img).attr('src')
            if not src:
                # <img> without a usable src attribute: nothing to download.
                continue

            # urljoin leaves absolute URLs untouched and builds a proper
            # "/"-separated URL for relative ones. os.path.join is a
            # filesystem helper: it inserts "\" on Windows and discards
            # base_url entirely when src starts with "/".
            image_url = urljoin(base_url, src)
            _, image_name = os.path.split(image_url)
            if not image_name:
                # URL ends in "/": there is no file name to save under.
                continue

            image_path = os.path.join(dir_name, image_name)
            urlretrieve(image_url, image_path)

    reply
    0
  • 漂亮男人

    漂亮男人2017-05-18 10:55:14

    First, in your code local=r'D:\%s\%s.jpg' % (filename,imgurl.splite("/")[-1]), "split" is misspelled as "splite".

    Also, in urllib.request.urlretrieve(imgurl,local), imgurl is not a valid URL; it is only a relative URL.
    It has to be turned into an absolute URL by prepending base_url = 'http://www.shop2255.com/'.

    There also seems to be a problem with the generated file path.

    # -*- coding: utf-8 -*-
    
    import urllib.request
    from urllib.request import urlopen, urlretrieve
    import urllib
    import urllib.parse
    import re
    import os
    from bs4 import BeautifulSoup
    
    # Site root that is prepended to each captured image src before download.
    base_url = 'http://www.shop2255.com/'
    
    # Product pages to scrape. The loop below uses each URL's file name
    # without its extension (e.g. "2603") as the name of the output folder.
    url_all =[
    'http://www.shop2255.com/showpro/2603.html',
    'http://www.shop2255.com/showpro/1558.html',
    'http://www.shop2255.com/showpro/1564.html',
    'http://www.shop2255.com/showpro/2411.html',
    'http://www.shop2255.com/showpro/2409.html',
    'http://www.shop2255.com/showpro/1561.html',
    'http://www.shop2255.com/showpro/2414.html',
    'http://www.shop2255.com/showpro/2609.html',
    'http://www.shop2255.com/showpro/2413.html',
    'http://www.shop2255.com/showpro/2604.html',
    'http://www.shop2255.com/showpro/2605.html',
    'http://www.shop2255.com/showpro/2606.html',
    'http://www.shop2255.com/showpro/2608.html',
    'http://www.shop2255.com/showpro/2607.html',
    'http://www.shop2255.com/showpro/2610.html']
    
    def getHtml(url):
        """Fetch *url*, print the decoded page and return it as GBK text.

        The response is opened in a ``with`` block so that it is closed even
        if reading fails.

        Args:
            url: Address of the page to download.

        Returns:
            The page content as ``str``.

        Raises:
            urllib.error.URLError: If the URL cannot be opened.
            UnicodeDecodeError: If the body is not valid GBK.
        """
        with urlopen(url) as response:
            raw = response.read()
        html = raw.decode("gbk")
        # Debug output kept from the answer: dumps the whole page to stdout.
        print(html)
        return html
    
    
    def getImg(html):
        """Return the value of every ``src="....jpg"`` attribute in *html*.

        Args:
            html: Page source as ``str``.

        Returns:
            A list of the captured ``src`` values, in document order. The
            list is empty when nothing matches.
        """
        # Raw string: "\." in a normal literal is an invalid escape sequence.
        # [^"] keeps a match inside one attribute value; with "." a non-.jpg
        # src would be glued to the next .jpg src on the same line.
        return re.findall(r'src="([^"]+?\.jpg)"', html)
    
    # For every page in url_all: download the page, collect its .jpg src
    # values and save each image into a D:\ folder named after the page.
    for page_url in url_all:
        # Note from the answer: the question's error did not occur here. The
        # only change needed was to pass the page straight to getImg(),
        # because getHtml() already returns a decoded str and calling
        # .decode() on it again fails.
        html = getHtml(page_url)
        img_urls = getImg(html)
        if not img_urls:
            # No images on this page, so no folder is created for it.
            continue

        # Folder name = page file name without extension,
        # e.g. ".../showpro/2603.html" -> "2603". Computed once per page
        # instead of once per image.
        page_name = os.path.splitext(os.path.basename(page_url))[0]
        target_dir = r'D:\%s' % page_name
        os.makedirs(target_dir, exist_ok=True)

        for x, imgurl in enumerate(img_urls):
            print(x)
            # getImg() only returns values ending in ".jpg", so the last path
            # segment is already a complete file name; appending ".jpg" again
            # would produce "name.jpg.jpg".
            local = os.path.join(target_dir, imgurl.split("/")[-1])
            # urljoin copes with absolute, root-relative and relative src
            # values; plain concatenation only works for the last kind.
            full_url = urllib.parse.urljoin(base_url, imgurl)
            try:
                urllib.request.urlretrieve(full_url, local)
            except Exception as exc:
                # Best effort: report the failed image and carry on with the
                # rest. Unlike a bare "except:", Ctrl+C still stops the run.
                print("can't retrieve the " + full_url + ": " + str(exc))

    print("done")
    

    reply
    0
  • Cancelreply