suchen

Heim  >  Fragen und Antworten  >  Hauptteil

Python - Seleium crawlt Webseitendaten und hat nur Angst vor der aktuellen Seite. Wenn ich zwei Seiten betrete, werden die anfänglichen Seitendaten zweimal heruntergeladen.

import requests
from lxml import html,etree
from selenium import webdriver
import time, json

#how many page do you want to scan
page_numnotint = input("how many page do you want to scan")
page_num = int(page_numnotint)
file_name = 'jd_goods_data.json'


url = 'https://list.jd.com/list.html?cat=1713,3264,3414&page=1&delivery=1&sort=sort_totalsales15_desc&trans=1&JL=4_10_0#J_main 

'
driver = webdriver.Chrome()
driver.get(url)
base_html = driver.page_source
selctor = etree.HTML(base_html)
date_info = []
name_data, price_data = [], []
jd_goods_data = {}
for q in range(page_num):
    i = int(1)
    while True:
        name_string = '//*[@id="plist"]/ul/li[%d]/p/p[3]/a/em/text()' %(i)
        price_string = '//*[@id="plist"]/ul/li[%d]/p/p[2]/strong[1]/i/text()' %(i)
        if i == 60:
            break
        else:
            i += 1
        name = selctor.xpath(name_string)[0]
        name_data.append(name)
        price = selctor.xpath(price_string)[0]
        price_data.append(price)
        jd_goods_data[name] = price

        print(name_data)
        with open(file_name, 'w') as f:
            json.dump(jd_goods_data, f)
    time.sleep(2)
    driver.find_element_by_xpath('//*[@id="J_bottomPage"]/span[1]/a[10]').click()
    time.sleep(2)
    
    # for k, v in jd_goods_data.items():
    #     print(k,v)
    # with open(file_name, 'w') as f:
    #     json.dump(jd_goods_data, f)
PHPzPHPz2755 Tage vor628

Antworte allen(1)Ich werde antworten

  • 黄舟

    黄舟2017-05-18 10:54:18

    import requests
    from lxml import html,etree
    from selenium import webdriver
    import time, json
    
    #how many page do you want to scan
    page_numnotint = input("how many page do you want to scan")
    page_num = int(page_numnotint)
    file_name = 'jd_goods_data.json'
    
    driver = webdriver.Chrome()
    date_info = []
    name_data, price_data = [], []
    jd_goods_data = {}
    for q in range(page_num):
        url = 'https://list.jd.com/list.html?cat=1713,3264,3414&page={page}&delivery=1&sort=sort_totalsales15_desc&trans=1&JL=4_10_0#J_main'.format(page=q)
        driver.get(url)
        base_html = driver.page_source
        selctor = etree.HTML(base_html)
        i = 1
        while True:
            name_string = '//*[@id="plist"]/ul/li[%d]/p/p[3]/a/em/text()' %(i)
            price_string = '//*[@id="plist"]/ul/li[%d]/p/p[2]/strong[1]/i/text()' %(i)
            if i == 60:
                break
            else:
                i += 1
            name = selctor.xpath(name_string)[0]
            name_data.append(name)
            price = selctor.xpath(price_string)[0]
            price_data.append(price)
            jd_goods_data[name] = price
    
            print(name_data)
    
    with open(file_name, 'w') as f:
        json.dump(jd_goods_data, f)
    
    driver.quit()

    Antwort
    0
  • StornierenAntwort