抓取小说网页,链接在代码中。数据处理为字典,写入txt文件中。不知道怎么处理编码问题,中文不能正确显示,折腾了整个晚上,还是不行,要死。
还有一个问题就是网页源代码中在<p>标签中经常插入<u>----</u>标签,搞到有些字抓取之后看不见,也求一并解答。
刚学python,望大神指点。首先感谢你宝贵的时间,谢谢。
# coding:utf-8
# NOTE(review): this is Python 2 code (print statements, unicode(), reload()).
from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
import requests
import sys
# Python 2-only hack: reload() re-exposes sys.setdefaultencoding (hidden by
# site.py) so implicit str<->unicode conversions use UTF-8 instead of ASCII.
# Widely discouraged -- it masks encoding bugs rather than fixing them.
reload(sys)
sys.setdefaultencoding('utf-8')
def towrite(contentdict, out=None):
    """Append one chapter (title + body) to the output file.

    contentdict: dict with
        'title'   -- unicode string, or the list of text fragments returned
                     by an xpath ``text()`` query
        'content' -- unicode string with the chapter body
    out: optional writable file-like object; defaults to the module-level
         handle ``f`` opened in the ``__main__`` block (kept for backward
         compatibility with the original global-variable design).
    """
    if out is None:
        out = f  # module-level file opened in __main__
    # xpath('...text()') returns a LIST of fragments; join it instead of
    # writing its repr -- the original unicode(list) put "[u'...']" in the file.
    title = u''.join(contentdict['title'])
    out.write(u'章:' + title + u'\n')
    out.write(contentdict['content'] + u'\n\n')
def spider(url):
    """Download one chapter page, extract title and body, and append them to
    the output file via towrite().

    Fixes the two problems the post is about:
    * Encoding: the server declares ISO-8859-1 but serves GBK-encoded pages
      (see the answer below), so forcing 'utf-8' produced mojibake.  Telling
      requests the real codec makes html.text decode correctly.
    * Inline <u>...</u> tags inside the content <p> split the text nodes, so
      '/p/p/text()' silently dropped the characters wrapped in them.
      xpath string(.) concatenates every descendant text node instead.
    """
    head = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}
    html = requests.get(url, headers=head)
    # Real page encoding is GBK despite the declared ISO-8859-1.
    html.encoding = 'gbk'
    seletor = etree.HTML(html.text)
    # text() returns a list of fragments; join them into one title string.
    title = u''.join(seletor.xpath('//p[@id="f_title1"]/h1/text()'))
    # string(.) gathers ALL descendant text, including text inside <u> tags.
    nodes = seletor.xpath('//p[@id="f_content1"]')
    fullcontent = nodes[0].xpath('string(.)') if nodes else u''
    item = {'title': title, 'content': fullcontent}
    towrite(item)
if __name__ == '__main__':
    pool = ThreadPool(4)
    # `with` guarantees the file is closed even if a worker raises; the
    # handle must stay named `f` because towrite() reads the module global.
    # NOTE(review): 4 worker threads write to `f` without a lock -- the
    # chapters may interleave; serialize the writes if ordering matters.
    with open('guduliujiang.txt', 'a') as f:
        base = 'http://www.sbkk8.cn/mingzhu/zhongguoxiandaiwenxuemingzhu/guduliujiang/'
        # Chapter ids count DOWN from 145231 (i=1) to 145204 (i=28).
        page = [base + str(145232 - i) + '.html' for i in range(1, 29)]
        pool.map(spider, page)
        pool.close()
        pool.join()
高洛峰 — answered 2017-04-18 09:08:20
Question 1:
Using print html.encoding shows that the encoding format is ISO-8859-1
print html.text.decode('ISO-8859-1') has no effect
Use the following code to print the Chinese characters correctly in the terminal:
html = requests.get(url,headers = head)
print html.encoding
#print html.headers['content-type']
print html.text.encode('latin-1').decode('gbk')
Question 2:
Take your novel webpage (Loneliness of Love (2)) as an example
content = seletor.xpath('//p[@id="f_content1"]')[0]
real_content=content.xpath('string(.)')
print real_content
Run it and check whether this is the result you want.
The following is the test code I made using your code:
#-*-coding:utf-8-*-
# Answerer's test script (Python 2) -- same imports as the asker's code.
from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
import requests
import sys
# Python 2-only default-encoding hack, reproduced from the asker's script;
# not recommended in general (it hides encoding errors).
reload(sys)
sys.setdefaultencoding('utf-8')
def towrite(contentdict):
    # Append one chapter to the module-level file handle `f`.
    # NOTE(review): contentdict['title'] comes from an xpath text() query and
    # is a LIST, so unicode() writes its repr (e.g. "[u'...']") into the
    # file -- presumably not intended; join the list first.
    f.writelines(u'章:' + unicode(contentdict['title']) + '\n')
    f.writelines(unicode(contentdict['content']) + '\n\n')
def spider(url):
    # Debug version of the asker's spider: print the decoded page, then
    # extract the full chapter text with xpath string(.).
    head = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36'}
    html = requests.get(url,headers = head)
    print html.encoding
    print html.headers['content-type']
    # The server labels the page ISO-8859-1, so requests decoded the GBK
    # bytes as latin-1.  Re-encoding to latin-1 recovers the original raw
    # bytes, which are then decoded as GBK to get readable Chinese.
    print html.text.encode('latin-1').decode('gbk')
    context=html.text.encode('latin-1').decode('gbk')
    seletor = etree.HTML(context)
    # title = seletor.xpath('//p[@id="f_title1"]/h1/text()')
    # string(.) concatenates EVERY descendant text node of the node, so the
    # characters wrapped in inline <u> tags are no longer lost.
    content = seletor.xpath('//p[@id="f_content1"]')[0]
    real_content=content.xpath('string(.)')
    print real_content
    # Remainder of the asker's pipeline, disabled for this single-page test:
    # fullcontent = ''
    # for each in content:
    # fullcontent += each
    # title.decode('gb2312').encode('utf-8')
    # fullcontent.encode('utf-8')
    # item = {}
    # item['title'] = title
    # item['content'] = fullcontent
    # towrite(item)
if __name__ == '__main__':
    # Thread pool and file output are disabled: fetch a single known chapter
    # page synchronously to verify the encoding fix and the xpath extraction.
    # pool = ThreadPool(4)
    # f = open('guduliujiang.txt','a')
    # page = []
    # for i in range(1,29):
    newpage = 'http://www.sbkk8.cn/mingzhu/zhongguoxiandaiwenxuemingzhu/guduliujiang/145229.html'
    # page.append(newpage)
    # results = pool.map(spider,page)
    # pool.close()
    # pool.join()
    # f.close()
    spider(newpage)