Home >Backend Development >Python Tutorial >Scrapy tutorial--crawling the first N articles of a website
2) Analyze the page structure: each `td` cell represents one blogger.
The first `small` tag holds the ranking.
The second `a` tag holds the nickname and the username, as well as the address of the blog's home page. The username is obtained by extracting it from that address.
The fourth `small` tag holds the number of blog posts and the score; each value can be extracted after splitting the string.
3) Code: use XPath to select the tags and extract their content. After obtaining the address of each blog's home page, send a request to it.
# Callback for the ranking page: loops over the <td> cells of the table with
# width='90%', creates a CnblogsItem per cell and issues a request to the
# href of the cell's first <a> element.
# NOTE(review): this listing was damaged during extraction/translation.
# Indentation is lost, several statements are truncated, and identifiers were
# re-cased (`Yield`, `scrapy.request`, `.Extract ()`, `.Strip ()`). It is not
# valid Python as shown; restore it from the original source before running.
def parse(self, response):
for i in response.xpath("//table[@width='90%']//td"):
item = CnblogsItem()
# NOTE(review): fragment only; the start of this statement is missing.
# Presumably it assigned item['top'], which parse_page reads; confirm.
2] .strip ()
# NOTE(review): two statements are fused on the next two lines. Everything
# after '#' is currently a comment, so the 'userName' assignment is lost.
# The key is spelled 'nickname' here but parse_page reads 'nickName'.
item ['nickname'] = i.xpath ("./ a [1] // text ()"). Extract () [0] .Strip ()#item [ 'userName'] = i.xpath(
# NOTE(review): the expression assigned to totalAndScore is truncated.
"./a[1]/@href") totalAndScore = i.xpath(
# Third element of the split totalAndScore value is stored as the score.
item['score'] = totalAndScore[2].strip()
# print(top)
# print(nickName)
# print(userName)
# print(total)
# Print (score)
# Return
# NOTE(review): the request call is unterminated; no callback argument or
# closing parenthesis is visible (presumably callback=self.parse_page; confirm).
# The meta key is 'page ' with a trailing space, while parse_page reads 'page'.
Yield scrapy.request (i.xpath ("./ a [1]/@href"). Extract () [0], meta = {'page ': 1, 'item': item},
def parse_page(self, response):
    """Parse one page of a blogger's article list.

    Yields one request per article link on the page (handled by
    ``parse_content``). If the page listed at least one article, it also
    yields a request for the next list page, handled by this method again.

    ``response.meta`` must contain ``'page'`` (the current page number) and
    ``'item'`` (the blogger-level fields ``top``, ``nickName``, ``userName``
    and ``score``), which are copied into every article item.
    """
    blogger = response.meta['item']

    # Cut off an existing 'default.aspx?...' suffix so the next-page URL is
    # always built from the blog's root address. Without the else branch the
    # full URL would be reused and the suffix appended a second time.
    url_parts = response.url.split('default.aspx?')
    if len(url_parts) > 1:
        base_url = url_parts[-2]
    else:
        base_url = response.url

    title_links = response.xpath("//a[contains(@id,'TitleUrl')]")
    for link in title_links:
        item = CnblogsItem()
        item['top'] = int(blogger['top'])
        item['nickName'] = blogger['nickName']
        item['userName'] = blogger['userName']
        item['score'] = int(blogger['score'])
        item['pageLink'] = response.url
        # Drop the "pinned" markers that the site prepends to some titles.
        item['title'] = link.xpath(
            "./text()").extract()[0].replace(u'[置顶]', '').replace('[Top]', '').strip()
        article_url = link.xpath("./@href").extract()[0]
        item['articleLink'] = article_url
        yield scrapy.Request(article_url, meta={'item': item}, callback=self.parse_content)

    # A page without article links ends the pagination for this blogger.
    if len(title_links) > 0:
        next_page = response.meta['page'] + 1
        yield scrapy.Request(
            base_url + 'default.aspx?page=' + str(next_page),
            meta={'page': next_page, 'item': blogger},
            callback=self.parse_page)
def parse_content(self, response):
    """Store the article body on the item from ``response.meta`` and yield it.

    The body is the first ``<div id="cnblogs_post_body">`` element of the
    page, serialized as a string. When the page has no such element, a
    placeholder string (Chinese for "this article is encrypted") is stored
    instead.
    """
    bodies = response.xpath("//div[@id='cnblogs_post_body']").extract()
    item = response.meta['item']
    item['content'] = bodies[0] if bodies else u'该文章已加密'
    yield item
This part is not difficult. Remember to install pymongo first: `pip install pymongo`. In total there are more than 800,000 articles.
import pymongo

from cnblogs.items import CnblogsItem


class CnblogsPipeline(object):
    """Item pipeline that stores every ``CnblogsItem`` in MongoDB.

    Items are written to the ``articles`` collection of the ``cnblogs``
    database.
    """

    def __init__(self):
        # NOTE(review): the host is an empty string in the published listing,
        # presumably blanked out. Set the real MongoDB host before running.
        client = pymongo.MongoClient(host='', port=27017)
        database = client['cnblogs']
        self.table = database['articles']

    def process_item(self, item, spider):
        """Insert ``item`` into MongoDB if it is a ``CnblogsItem``.

        The item is always returned unchanged.
        """
        if isinstance(item, CnblogsItem):
            # insert_one() replaces Collection.insert(), which was
            # deprecated in PyMongo 3 and removed in PyMongo 4.
            self.table.insert_one(dict(item))
        return item
def process_request(self, request, spider):
    """Set the ``'proxy'`` entry of ``request.meta``.

    NOTE(review): the value is an empty string in the published listing;
    presumably the real proxy address was blanked out. Confirm before use.
    """
    proxy_address = ''
    request.meta['proxy'] = proxy_address
class CnblogsItem(scrapy.Item):
    """Fields stored for one crawled article and its author."""

    # Ranking
    top = scrapy.Field()
    nickName = scrapy.Field()
    userName = scrapy.Field()
    # Score (points)
    score = scrapy.Field()
    # Address of the list page the article was found on
    pageLink = scrapy.Field()
    # Article title
    title = scrapy.Field()
    # Article link
    articleLink = scrapy.Field()
    # Article content
    content = scrapy.Field()
# coding=utf-8
import sys
import jieba
from wordcloud import WordCloud
import pymongo
import threading
from Queue import Queue
import datetime
import os

# This script targets Python 2 (Queue module, reload, setdefaultencoding).
# Making utf-8 the implicit codec lets the str()/decode() round-trips below
# work on Chinese text.
reload(sys)
sys.setdefaultencoding('utf-8')

# Highest ranking that is processed (rankings start at 1).
_MAX_RANK = 3000

# Characters removed from a nickname before it is used in a file name.
_FORBIDDEN_FILENAME_CHARS = '<>/\\|:"*?'


class MyThread(threading.Thread):
    """Thread that calls ``func(*args)`` when started."""

    def __init__(self, func, args):
        threading.Thread.__init__(self)
        self.func = func
        self.args = args

    def run(self):
        self.func(*self.args)


def getTitle(queue, table):
    """Producer: queue the word-segmented article titles of each blogger.

    For every ranking from 1 to 3000, the titles of all documents in
    ``table`` with that ``top`` value are joined, segmented with jieba and
    put on ``queue`` as a ``(text, nickName, top)`` tuple. Rankings without
    documents are skipped. A final ``None`` tells the consumer that no more
    work will arrive.
    """
    for rank in range(1, _MAX_RANK + 1):
        docs = list(table.find({'top': rank}, {'title': 1, 'top': 1, 'nickName': 1}))
        if not docs:
            continue
        titles = ''.join(str(doc['title']) + '\n' for doc in docs)
        # Nickname and ranking are taken from the last matching document.
        last_doc = docs[-1]
        segmented = ' '.join(jieba.cut(titles))
        queue.put((segmented, last_doc['nickName'], last_doc['top']), True)
    # Without this marker the consumer would wait forever whenever at least
    # one ranking was skipped above.
    queue.put(None, True)


def getImg(queue, word):
    """Consumer: render one word-cloud image per queued blogger.

    Takes ``(text, nickName, top)`` tuples from ``queue`` and writes
    ``wordcloudimgs/<top>-<nickName>.jpg`` using the ``word`` generator.
    Stops after 3000 images or as soon as ``None`` is received.
    """
    for _ in range(1, _MAX_RANK + 1):
        job = queue.get(True)
        if job is None:
            break
        text, nickname, rank = job
        word.generate(text)
        safe_name = nickname
        for char in _FORBIDDEN_FILENAME_CHARS:
            safe_name = safe_name.replace(char, '')
        word.to_file('wordcloudimgs/' + str(rank) + '-' + str(safe_name).decode('utf-8') + '.jpg')
        print(str(nickname).decode('utf-8') + '\t生成成功')


def main():
    """Connect to MongoDB and run the producer and consumer threads."""
    # NOTE(review): the host is an empty string in the published listing,
    # presumably blanked out. Set the real MongoDB host before running.
    client = pymongo.MongoClient(host='', port=27017)
    dbName = client['cnblogs']
    table = dbName['articles']
    wc = WordCloud(
        font_path='msyh.ttc',
        background_color='#ccc',
        width=600,
        height=600)
    if not os.path.exists('wordcloudimgs'):
        os.mkdir('wordcloudimgs')
    queue = Queue()
    titleThread = MyThread(getTitle, (queue, table))
    imgThread = MyThread(getImg, (queue, wc))
    threads = [imgThread, titleThread]
    for t in threads:
        t.start()
    for t in threads:
        t.join()


if __name__ == "__main__":
    main()
The above is the detailed content of Scrapy tutorial--crawling the first N articles of a website. For more information, please follow other related articles on the PHP Chinese website!