Heim  >  Fragen und Antworten  >  Hauptteil

python - scrapy抓取天猫重定向(302)问题

spider.py代码

# -*- coding: utf-8 -*-
import scrapy
from topgoods.items import TopgoodsItem

class TmGoodsSpider(scrapy.Spider):
    """Scrape a Tmall search-result page and each product's detail page.

    ``parse`` reads the product list and schedules one detail request per
    product; ``parse_detail`` adds shop and company data to the item carried
    in ``response.meta`` and yields it.

    If a response does not contain the expected markup (for example because
    the request was redirected to a login page), the callback logs the URL
    and stops instead of raising ``IndexError``.
    """

    name = "tm_goods"
    # allowed_domains takes bare domain names, not URLs. "tmall.com" also
    # covers sub-domains such as list.tmall.com.
    allowed_domains = ["tmall.com"]
    start_urls = (
        'https://list.tmall.com/search_product.htm?q=%C5%AE%D7%B0&type=p&spm=a220m.1000858.a2227oh.d100&from=.list.pc_1_searchbutton',
    )
    # Number of list pages processed so far (shared class-level counter).
    count = 0

    def parse(self, response):
        """Yield one detail-page request per product found on a list page.

        Each request carries a partially filled ``TopgoodsItem`` (price,
        name, URL) in ``meta['item']``.
        """
        TmGoodsSpider.count += 1

        # NOTE(review): every container in these XPaths is a <p> element;
        # confirm that against the live page markup.
        products = response.xpath("//p[@id='J_ItemList']/p[@class='product']/p")
        if not products:
            self.log("List Page error--%s" % response.url)
            return

        for product in products:
            href = product.xpath("p[@class='productTitle']/a/@href").extract_first()
            if not href:
                # Without a link there is no detail page to request; skip
                # this product but keep processing the rest of the page.
                self.log("Product without link on %s" % response.url)
                continue

            item = TopgoodsItem()
            # Product price
            item["GOODS_PRICE"] = product.xpath(
                "p[@class='productPrice']/em/@title").extract_first()
            # Product name
            item["GOODS_NAME"] = product.xpath(
                "p[@class='productTitle']/a/@title").extract_first()
            # Product link. urljoin resolves protocol-relative ("//host/...")
            # and relative hrefs against the page URL and leaves absolute
            # http/https URLs untouched.
            item["GOODS_URL"] = response.urljoin(href)

            yield scrapy.Request(
                url=item["GOODS_URL"],
                meta={'item': item},
                callback=self.parse_detail,
                dont_filter=True,
            )

    def parse_detail(self, response):
        """Add shop and company fields to the item and yield it.

        Nothing is yielded when the expected block is missing from the page.
        """
        blocks = response.xpath('//p[@class="extend"]/ul')
        if not blocks:
            self.log("Detail Page error--%s" % response.url)
            return

        item = response.meta['item']
        info = blocks[0]
        # Shop name
        item["SHOP_NAME"] = info.xpath("li[1]/p/a/text()").extract_first()
        # Shop link
        item["SHOP_URL"] = info.xpath("li[1]/p/a/@href").extract_first()
        # Company name
        item["COMPANY_NAME"] = info.xpath(
            "li[3]/p/text()").extract_first(default="").strip()
        # Company location
        item["COMPANY_ADDRESS"] = info.xpath(
            "li[4]/p/text()").extract_first(default="").strip()

        yield item

结果:

10-15 19:20:06 [scrapy] DEBUG: Redirecting (302) to <GET https://login.taob
m/jump?target=https%3A%2F%2Flist.tmall.com%2Fsearch_product.htm%3Ftbpm%3D1%
D%25C5%25AE%25D7%25B0%26type%3Dp%26spm%3Da220m.1000858.a2227oh.d100%26from%
st.pc_1_searchbutton> from <GET https://list.tmall.com/search_product.htm?q
AE%D7%B0&type=p&spm=a220m.1000858.a2227oh.d100&from=.list.pc_1_searchbutton

10-15 19:20:06 [scrapy] DEBUG: Redirecting (302) to <GET https://pass.tmall
add?_tb_token_=KL9DqtpQ4JXA&cookie2=fc1318de70224bfb4688cb59f2166e17&t=4d43
c2cda976f8ace84a7f74a08&target=https%3A%2F%2Flist.tmall.com%2Fsearch_produc
%3Ftbpm%3D1%26q%3D%25C5%25AE%25D7%25B0%26type%3Dp%26spm%3Da220m.1000858.a22
d100%26from%3D.list.pc_1_searchbutton&pacc=RRsp0ixWwD7auxG1xr9HDg==&opi=59.
.222&tmsc=1444908006341549> from <GET https://login.taobao.com/jump?target=
%3A%2F%2Flist.tmall.com%2Fsearch_product.htm%3Ftbpm%3D1%26q%3D%25C5%25AE%25
B0%26type%3Dp%26spm%3Da220m.1000858.a2227oh.d100%26from%3D.list.pc_1_search
n>

好像是网址发生了转移,请求被重定向(302)到了登录页面。请问我要怎么修改代码,才能获得我想要的信息?

PHP中文网PHP中文网2741 Tage vor3058

Antworte allen(3)Ich werde antworten

  • 天蓬老师

    天蓬老师2017-04-17 16:09:09

    被跳转到登录页面了,天猫有防爬装置。你仔细研究下天猫detail域下的cookie,把cookie带上去访问吧。

    Antwort
    0
  • PHPz

    PHPz2017-04-17 16:09:09

    嗯,应该是防爬虫,你可以cookie带上试试。

    Antwort
    0
  • 阿神

    阿神2017-04-17 16:09:09

    解决了吗?我也遇到了同样的问题,不知道怎么添加cookie。
    看了视频,里面是这样添加的:

    #-*- coding:utf-8 -*-
    import scrapy
    
    class StackOverflowSpider(scrapy.Spider):
        """Example spider that sends its first request with explicit cookies.

        ``start_requests`` builds the initial request by hand so that the
        ``cookies`` dict is attached to it; ``parse`` prints "success" when
        the expected table header is present in the response.
        """

        name = 'stackoverflow'
        # Not used for crawling: start_requests() is overridden below, so no
        # requests are generated from start_urls.
        start_urls = ['http://stackoverflow.com/questions?sort=votes']

        def start_requests(self):
            """Return the initial request, carrying the session cookies."""
            url = "http://db.bioon.com/list.php?channelid=1016&classid=951"
            # NOTE(review): these look like real session credentials; load
            # them from settings or the environment rather than source code.
            cookies = {
                'dz_username': 'wst_today',
                'dz_uid': '1322052',
                'buc_key': 'ofR1I78RBaCHkGp8MdBBRjMx7ustawtY',
                'buc_token': 'a91b8fef55c66846d3975a9fd8883455',
            }
            return [
                scrapy.Request(url, cookies=cookies),
            ]

        def parse(self, response):
            """Print "success" if the first header cell of the table exists."""
            ele = response.xpath(
                '//table[@class="table table-striped"]/thead/tr/th[1]/text()'
            ).extract()
            if ele:
                # Function-call form works on both Python 2 and Python 3;
                # the statement form is a SyntaxError on Python 3.
                print("success")

    但是换了天猫网站还是报错,不知道怎么写cookie变量

    Antwort
    0
  • StornierenAntwort