Rumah  >  Soal Jawab  >  teks badan

python - selenium merangkak data halaman web tetapi hanya merangkak halaman semasa. Jika saya memasukkan dua halaman, data halaman pertama akan dimuat turun dua kali.

import requests
from lxml import html,etree
from selenium import webdriver
import time, json

# Destination file for the scraped data.
file_name = 'jd_goods_data.json'

# Ask how many listing pages to scan; the raw answer is kept as text and
# converted to an integer in a separate step.
page_numnotint = input("how many page do you want to scan")
page_num = int(page_numnotint)


# Listing URL the browser opens first (its query string carries page=1).
# The literal must sit on a single line: a quoted string split across lines
# is an unterminated string literal and the script will not parse.
url = 'https://list.jd.com/list.html?cat=1713,3264,3414&page=1&delivery=1&sort=sort_totalsales15_desc&trans=1&JL=4_10_0#J_main'
# Open the listing URL in Chrome and parse the page source once, up front.
driver = webdriver.Chrome()
driver.get(url)
base_html = driver.page_source
# NOTE: selctor is built from this single snapshot and is never reassigned
# inside the loop below, so every pass of the outer loop reads the items of
# the first page again.
selctor = etree.HTML(base_html)
date_info = []
name_data, price_data = [], []
jd_goods_data = {}
for q in range(page_num):
    i = int(1)
    while True:
        # XPath expressions for the name and price text of the i-th <li>
        # under the element with id "plist".
        name_string = '//*[@id="plist"]/ul/li[%d]/p/p[3]/a/em/text()' %(i)
        price_string = '//*[@id="plist"]/ul/li[%d]/p/p[2]/strong[1]/i/text()' %(i)
        # The break is tested before the lookup, so items 1..59 are read and
        # the expressions built for i == 60 are never queried.
        if i == 60:
            break
        else:
            i += 1
        # [0] raises IndexError when the XPath matches nothing.
        name = selctor.xpath(name_string)[0]
        name_data.append(name)
        price = selctor.xpath(price_string)[0]
        price_data.append(price)
        # Keyed by name, so a repeated name overwrites the earlier price.
        jd_goods_data[name] = price

        # Runs once per item: prints the whole accumulated name list and
        # rewrites the JSON file from scratch each time.
        print(name_data)
        with open(file_name, 'w') as f:
            json.dump(jd_goods_data, f)
    time.sleep(2)
    # Clicks the element at this XPath under #J_bottomPage (presumably the
    # next-page link - confirm). The browser moves on, but the parsed tree
    # in selctor is not refreshed afterwards.
    driver.find_element_by_xpath('//*[@id="J_bottomPage"]/span[1]/a[10]').click()
    time.sleep(2)
    
    # for k, v in jd_goods_data.items():
    #     print(k,v)
    # with open(file_name, 'w') as f:
    #     json.dump(jd_goods_data, f)
PHPzPHPz2712 hari yang lalu597

Semua balasan (1) · Saya akan balas

  • 黄舟

    黄舟2017-05-18 10:54:18

    import requests
    from lxml import html,etree
    from selenium import webdriver
    import time, json
    
    # Scrape the name and price of every product on the first N listing
    # pages and save the name -> price mapping as JSON.

    # how many pages do you want to scan
    page_numnotint = input("how many page do you want to scan")
    page_num = int(page_numnotint)
    file_name = 'jd_goods_data.json'

    # Only the page number changes between requests.
    url_template = 'https://list.jd.com/list.html?cat=1713,3264,3414&page={page}&delivery=1&sort=sort_totalsales15_desc&trans=1&JL=4_10_0#J_main'

    driver = webdriver.Chrome()
    date_info = []
    name_data, price_data = [], []
    jd_goods_data = {}
    try:
        # range() starts at 0, but the listing's first page is page=1 (the
        # URL in the question starts there), so count 1..page_num. Counting
        # from 0 would request page=0 and never reach the last page asked for.
        for page in range(1, page_num + 1):
            url = url_template.format(page=page)
            driver.get(url)
            # Re-parse after every navigation: a tree built once outside the
            # loop would keep returning the first page's items.
            base_html = driver.page_source
            selctor = etree.HTML(base_html)
            # Walk the <li> entries the page actually contains instead of a
            # hard-coded index range; the old "break at 60" test ran before
            # the lookup and silently dropped the 60th item.
            # NOTE(review): the nested p/p steps are kept from the original
            # expressions - confirm them against the live page markup.
            for item in selctor.xpath('//*[@id="plist"]/ul/li'):
                names = item.xpath('./p/p[3]/a/em/text()')
                prices = item.xpath('./p/p[2]/strong[1]/i/text()')
                # Skip entries missing either field rather than crashing
                # with IndexError on an empty result list.
                if not names or not prices:
                    continue
                name, price = names[0], prices[0]
                name_data.append(name)
                price_data.append(price)
                jd_goods_data[name] = price

            # Report progress once per page, not once per item.
            print(name_data)
    finally:
        # Always close the browser, even when a page fails to load or parse.
        driver.quit()

    with open(file_name, 'w') as f:
        json.dump(jd_goods_data, f)

    balas
    0
  • Batalbalas