python - scrapy: accepting a user-supplied argument

The spider needs to accept a user-specified start URL for the crawl, but I'm not sure what the right way to take the argument is. Code like the one below raises this error:

  File "/usr/local/lib/python2.7/site-packages/scrapy/contrib/spiders/crawl.py", line 67, in _parse_response
    cb_res = callback(response, **cb_kwargs) or ()
exceptions.TypeError: 'str' object is not callable
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
import os

class dmozspider(CrawlSpider):
    name = 'dmoz'

    def __init__(self, starturl='www.wooyun.org'):
        self.start_urls = ['http://' + starturl]
        self.allowed_domains = [starturl]
        self._rules = (
            Rule(LinkExtractor(allow=()), callback="parse_item", follow=True),)
    # start_urls=['http://www.wooyun.org']
    # allowed_domains=['www.wooyun.org']
    # rules=(
    #     Rule(LinkExtractor(allow=()), callback="parse_item", follow = True),)

    def parse_item(self, response):
        print response.url
        # append each crawled URL to url.txt; the with-block closes the file
        with open('url.txt', 'a') as results:
            results.write(response.url + os.linesep)

Got it working; the code below does the trick. As the hint goes, "you probably redefined __init__ without calling super": CrawlSpider.__init__ calls self._compile_rules(), which resolves the string callback "parse_item" into a bound method. Assigning self._rules directly skips that step, so the callback is still a plain string when a response comes back, hence the TypeError: 'str' object is not callable.

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor

class dmozspider(CrawlSpider):
    name = 'dmoz'
    rules = (
        Rule(LinkExtractor(allow=()), callback="parse_item", follow=True),)

    def __init__(self, starturl='www.wooyun.org', *args, **kwargs):
        # let CrawlSpider.__init__ run _compile_rules() on the class-level rules
        super(dmozspider, self).__init__(*args, **kwargs)
        self.start_urls = ['http://' + starturl]
        self.allowed_domains = [starturl]
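
With the argument now accepted through __init__, the start URL can be supplied on the command line with Scrapy's -a option, which forwards each key=value pair to the spider's constructor as a keyword argument:

    scrapy crawl dmoz -a starturl=www.wooyun.org

The domain here is just the question's own example; any starturl passed this way overrides the default value in the signature.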
高洛峰 · 2767 days ago
