我在尝试爬取CNKI这个表格中的部分字段。表格的作者列里,有些文献含有多个作者,因此对应多个标签。这样抓取下来之后,文献名称和作者就对不上了,多个作者的标签还会换行。请问应该怎么做,才能在写入csv时让一条文献对应它的多个作者名字?
Python2
# coding:utf-8
import re
import csv
import codecs
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import warnings
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
warnings.filterwarnings("ignore")
# Scrape one page of CNKI expert-search results and write one CSV row per
# record: url, title, all authors of that record, and the 'choose' column.
#
# Authors are collected per result row rather than page-wide: a record may
# carry several author links, so zipping independent page-wide lists of
# titles and authors shifts every following row out of alignment.


def _joined_link_texts(container, attrs):
    """Return the stripped texts of all matching <a> tags, joined by '; '.

    ``container`` is a BeautifulSoup tag (or None, which yields u'').
    Empty texts are skipped so stray whitespace-only links add no separators.
    """
    if container is None:
        return u''
    texts = [a.get_text().strip() for a in container.findAll(name="a", attrs=attrs)]
    return u'; '.join(t for t in texts if t)


driver = webdriver.Firefox()
try:
    driver.get("http://epub.cnki.net/kns/brief/result.aspx?dbprefix=scdb&action=scdbsearch&db_opt=SCDB")
    driver.find_element_by_link_text(u'专业检索').click()
    time.sleep(3)  # wait for the expert-search form to appear after the click
    query_box = driver.find_element_by_css_selector("textarea[id=\"expertvalue\"]")
    query_box.clear()
    query_box.send_keys(u"TI='生态'")
    driver.find_element_by_id("btnSearch").click()
    # The result list is rendered inside an iframe; its source is what we parse.
    driver.switch_to_frame("iframeResult")
    page = driver.page_source
finally:
    # Always release the browser, even when a lookup above fails.
    driver.quit()

note = BeautifulSoup(page)

header = ['url', 'name', 'writer', 'choose']
with open('tabletable.csv', 'wb') as f:
    # BOM so that spreadsheet tools detect the file as UTF-8.
    f.write(codecs.BOM_UTF8)
    writer = csv.writer(f, delimiter=',')
    writer.writerow(header)

    # Each title link (class "fz14") identifies one record.
    for title_link in note.findAll(name="a", attrs={"class": "fz14"}):
        # NOTE(review): assumes every record sits in its own <tr> together with
        # its author and source links -- confirm against the live page markup.
        row = title_link.find_parent('tr')

        record = [
            # A missing href yields just the site prefix instead of a TypeError.
            u'http://www.cnki.net' + (title_link.get('href') or u''),
            title_link.get_text().strip(),
            _joined_link_texts(row, {"target": "knet"}),
            _joined_link_texts(row, {"class": "KnowledgeNetLink", "target": "_blank"}),
        ]
        # Python 2's csv module writes byte strings; encode explicitly and let
        # the writer handle quoting of commas, quotes and newlines in cells.
        writer.writerow([cell.encode('utf-8') for cell in record])
PHP中文网2017-04-18 10:11:16
It feels like you are messing around. You want to make the data you get look like this:
[
{'url': 'x', 'name': 'a'},
{'url': 'x', 'name': '李海舰; 田跃新; 李文杰'}
]