Accueil  >  Questions et réponses  >  Corps du texte

python - Selenium parcourt les données de la page Web mais ne récupère que la page actuelle. Si je demande deux pages, les données de la première page sont récupérées deux fois.

import requests
from lxml import html,etree
from selenium import webdriver
import time, json

# Ask the user how many listing pages to scan.
page_numnotint = input("how many page do you want to scan")
# input() returns a string; int() raises ValueError if it is not a valid integer.
page_num = int(page_numnotint)
# Name of the JSON file the scraped name -> price mapping is written to.
file_name = 'jd_goods_data.json'


# Listing URL of the first results page (page=1). The literal must stay on a
# single line: a plain quoted string cannot span lines, and trailing
# whitespace would become part of the requested URL.
url = 'https://list.jd.com/list.html?cat=1713,3264,3414&page=1&delivery=1&sort=sort_totalsales15_desc&trans=1&JL=4_10_0#J_main'
# Open a Chrome session and load the first listing page.
driver = webdriver.Chrome()
driver.get(url)
# NOTE: the page source is captured and parsed exactly once, here, before the
# loop. `selctor` is never rebuilt inside the loop, so every pass of the
# `for q` loop below queries this same parsed tree (the first page), even
# after the browser has been told to move on.
base_html = driver.page_source
selctor = etree.HTML(base_html)
date_info = []  # not used anywhere in the visible code
name_data, price_data = [], []
# Maps product name -> price; a repeated name overwrites the earlier entry.
jd_goods_data = {}
for q in range(page_num):
    i = int(1)
    while True:
        # XPath expressions for the i-th <li> of the product list.
        name_string = '//*[@id="plist"]/ul/li[%d]/p/p[3]/a/em/text()' %(i)
        price_string = '//*[@id="plist"]/ul/li[%d]/p/p[2]/strong[1]/i/text()' %(i)
        # The strings above were built with the current i; i is bumped before
        # they are used. The loop stops as soon as i reaches 60, so items
        # 1..59 are read and li[60] never is.
        if i == 60:
            break
        else:
            i += 1
        # [0] raises IndexError when the XPath matches nothing for this item.
        name = selctor.xpath(name_string)[0]
        name_data.append(name)
        price = selctor.xpath(price_string)[0]
        price_data.append(price)
        jd_goods_data[name] = price

        # Prints the whole accumulated list and rewrites the whole JSON file
        # after every single item.
        print(name_data)
        with open(file_name, 'w') as f:
            json.dump(jd_goods_data, f)
    time.sleep(2)
    # Clicks a link in the bottom pager -- presumably the "next page" link
    # (TODO confirm). The new page's HTML is never read back into `selctor`.
    driver.find_element_by_xpath('//*[@id="J_bottomPage"]/span[1]/a[10]').click()
    time.sleep(2)
    
    # for k, v in jd_goods_data.items():
    #     print(k,v)
    # with open(file_name, 'w') as f:
    #     json.dump(jd_goods_data, f)
PHPzPHPz2712 Il y a quelques jours608

répondre à tous(1)je répondrai

  • 黄舟

    黄舟2017-05-18 10:54:18

    import requests
    from lxml import html,etree
    from selenium import webdriver
    import time, json
    
    # Scrape product names and prices from the first N listing pages and save
    # them as a JSON object mapping name -> price.

    # Ask how many pages to scan; int() raises ValueError on non-numeric input.
    page_numnotint = input("how many page do you want to scan")
    page_num = int(page_numnotint)
    file_name = 'jd_goods_data.json'

    # {page} is filled in for each page inside the loop.
    url_template = 'https://list.jd.com/list.html?cat=1713,3264,3414&page={page}&delivery=1&sort=sort_totalsales15_desc&trans=1&JL=4_10_0#J_main'
    # The same nodes the indexed li[1], li[2], ... expressions addressed, but
    # selected in one query so the number of items per page is not hard-coded.
    item_xpath = '//*[@id="plist"]/ul/li'
    name_xpath = './p/p[3]/a/em/text()'
    price_xpath = './p/p[2]/strong[1]/i/text()'

    driver = webdriver.Chrome()
    date_info = []  # kept from the original script; not used below
    name_data, price_data = [], []
    jd_goods_data = {}
    try:
        for q in range(page_num):
            # range() counts from 0, but the listing's first page is page=1,
            # so shift by one to fetch pages 1..page_num.
            url = url_template.format(page=q + 1)
            driver.get(url)
            # Re-read and re-parse the source on every iteration so each pass
            # sees the page that was just loaded.
            base_html = driver.page_source
            selctor = etree.HTML(base_html)
            for item in selctor.xpath(item_xpath):
                names = item.xpath(name_xpath)
                prices = item.xpath(price_xpath)
                # Skip entries that lack a name or a price instead of
                # crashing with IndexError on [0].
                if not names or not prices:
                    continue
                name, price = names[0], prices[0]
                name_data.append(name)
                price_data.append(price)
                # Keyed by name: a repeated name overwrites the earlier price.
                jd_goods_data[name] = price

            # Progress output once per page rather than once per item.
            print(name_data)
    finally:
        # Always close the browser, even if a page fails to load or parse.
        driver.quit()

    # Written once, after all pages; explicit encoding keeps the file
    # independent of the platform default.
    with open(file_name, 'w', encoding='utf-8') as f:
        json.dump(jd_goods_data, f)

    répondre
    0
  • Annulerrépondre