Rumah > Soal Jawab > teks badan
scrapy只能爬取一个页面上的链接,不能持续运行爬完全站,以下是代码,初学求指导。
class DbbookSpider(scrapy.Spider):
name = "imufe"
allowed_domains = ['http://www.imufe.edu.cn/']
start_urls=('http://www.imufe.edu.cn/main/dtxw/201704/t20170414_127035.html')
def parse(self, response):
item = DoubanbookItem()
selector = scrapy.Selector(response)
print(selector)
books = selector.xpath('//a/@href').extract()
link=[]
for each in books:
each=urljoin(response.url,each)
link.append(each)
for each in link:
item['link'] = each
yield item
i = random.randint(0,len(link)-1)
nextPage = link[i]
yield scrapy.http.Request(nextPage,callback=self.parse)