搜尋

首頁  >  問答  >  主體

新手求教python3如何把dict循環寫入csv檔(進行爬蟲時遇到的問題)?

爬蟲生成dict後,想將其寫入csv文件,卻出錯
使用jupyter notebook,window環境。

具體程式碼如下

import requests

from multiprocessing.dummy import Pool as ThreadPool

from lxml import etree

import sys

import time

import random

import csv


def spider(url):

    header={

        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'

    }

    timeout=random.choice(range(31,50))

    html = requests.get(url,header,timeout=timeout)

    time.sleep(random.choice(range(8,16)))

    selector = etree.HTML(html.text)

    content_field = selector.xpath('//*[@class="inner"]/p[3]/p[2]/ul/li')

    item ={}

    for each in content_field:

        g = each.xpath('a/p[1]/p[1]/h3/span/text()')

        go = each.xpath('a/p[1]/p[2]/p/h3/text()')

        h = each.xpath('a/p[1]/p[2]/p/p/text()[1]')

        j= each.xpath('a/p[1]/p[1]/p/text()[2]')

        ge = each.xpath('a/p[1]/p[2]/p/p/text()[3]')

        x = each.xpath('a/p[1]/p[1]/p/text()[3]')

        city = each.xpath('a/p[1]/p[1]/p/text()[1]')

        gg = each.xpath('a/p[2]/span/text()')

        item['city']="".join(city)

        item['hangye']="".join(hangye)

        item['guimo']="".join(guimo)

        item['gongsi']="".join(gongsi)

        item['gongzi']="".join(gongzi)

        item['jingyan']="".join(jingyan)

        item['xueli']="".join(xueli)

        item['gongzuoneirong']="".join(gongzuoneirong)

        fieldnames =['city','hangye','guimo','gongsi','gongzi','jingyan','xueli','gongzuoneirong']

        with open('bj.csv','a',newline='',errors='ignore')as f:

            f_csv=csv.DictWriter(f,fieldnames=fieldnames)

            f_csv.writeheader()

            f_csv.writerow(item)


if __name__ == '__main__':
    pool = ThreadPool(4)
    f=open('bj.csv','w')
    page = []
    for i in range(1,100):
        newpage = 'https://www.zhipin.com/c101010100/h_101010100/?query=%E6%95%B0%E6%8D%AE%E8%BF%90%E8%90%A5&page='+str(i) + '&ka=page-' + str(i)
        page.append(newpage)
        
    results = pool.map(spider,page)
    pool.close()
    pool.join()
    f.close()

執行上面程式碼,提示錯誤為

ValueError: too many values to unpack (expected 2)
透過查詢原因是要將dict遍歷,需要dict.items()的形式。但在上述程式碼中如何實現,一直沒有理順,求教各位

我想大声告诉你我想大声告诉你2754 天前1434

全部回覆(3)我來回復

  • 習慣沉默

    習慣沉默2017-05-18 10:51:20

    不好意思哈,現在才有時間來回答你的問題,看到你根據我的建議把代碼改過來了,下面我把改過的代碼貼出來,我運行過,是沒問題的

    import requests
    from multiprocessing.dummy import Pool
    from lxml import etree
    import time
    import random
    import csv
    
    
    def spider(url):
        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
        }
    
        timeout = random.choice(range(31, 50))
        html = requests.get(url, headers=header, timeout=timeout)
    
        time.sleep(random.choice(range(8, 16)))
    
        selector = etree.HTML(html.text)
    
        content_field = selector.xpath('//*[@class="inner"]/p[3]/p[2]/ul/li')
    
        item = {}
    
        for each in content_field:
            g = each.xpath('a/p[1]/p[1]/h3/span/text()')
    
            go = each.xpath('a/p[1]/p[2]/p/h3/text()')
    
            h = each.xpath('a/p[1]/p[2]/p/p/text()[1]')
    
            j = each.xpath('a/p[1]/p[1]/p/text()[2]')
    
            ge = each.xpath('a/p[1]/p[2]/p/p/text()[3]')
    
            x = each.xpath('a/p[1]/p[1]/p/text()[3]')
    
            city = each.xpath('a/p[1]/p[1]/p/text()[1]')
    
            gg = each.xpath('a/p[2]/span/text()')
    
            item['city'] = "".join(city)
    
            item['hangye'] = "".join(g)
    
            item['guimo'] = "".join(go)
    
            item['gongsi'] = "".join(h)
    
            item['gongzi'] = "".join(j)
    
            item['jingyan'] = "".join(ge)
    
            item['xueli'] = "".join(x)
    
            item['gongzuoneirong'] = "".join(gg)
    
            fieldnames = ['city', 'hangye', 'guimo', 'gongsi', 'gongzi', 'jingyan', 'xueli', 'gongzuoneirong']
    
            with open('bj.csv', 'a', newline='', errors='ignore')as f:
                f_csv = csv.DictWriter(f, fieldnames=fieldnames)
    
                f_csv.writeheader()
    
                f_csv.writerow(item)
    
    
    if __name__ == '__main__':
        f = open('bj.csv', 'w')
        page = []
        for i in range(1, 100):
            newpage = 'https://www.zhipin.com/c101010100/h_101010100/?query=%E6%95%B0%E6%8D%AE%E8%BF%90%E8%90%A5&page=' + str(
                i) + '&ka=page-' + str(i)
            page.append(newpage)
        print(page)
        pool = Pool(4)
        results = pool.map(spider, page)
        pool.close()
        pool.join()
        f.close()
    

    這裡主要是header,你原來是set类型,我修改后是dict類型

    這裡還需要給你一些建議

    1. 你的程式碼是放到ide還是文字編輯器中運行的?有的東西在ide下明顯會報錯啊

    2. 建議新手從開始學的時候就遵守PEP8規範,別養成了壞習慣,你看看你的命名

    回覆
    0
  • 过去多啦不再A梦

    过去多啦不再A梦2017-05-18 10:51:20

    item = {'a':1, 'b':2}
    fieldnames = ['a', 'b']
    
    with open('test.csv', 'a') as f:
        f_csv = DictWriter(f, fieldnames=fieldnames)
        f_csv.writeheader()
        f_csv.writerow(item)

    我這樣寫並沒報錯

    writerow就是直接接收dict的吧,你這個問題,我感覺是因為item的key與你表頭不對應

    回覆
    0
  • 漂亮男人

    漂亮男人2017-05-18 10:51:20

    因為在 fields 中指定的某些列名在 item 中不存在

    回覆
    0
  • 取消回覆