search

Home  >  Q&A  >  body text

python - Selenium only crawls data from the current page: if I enter two pages, the first page's data gets downloaded twice.

import requests
from lxml import html, etree
from selenium import webdriver
import time, json

# How many result pages to scan.
page_numnotint = input("how many page do you want to scan")
page_num = int(page_numnotint)
file_name = 'jd_goods_data.json'

# BUG FIX: the original URL literal had a stray newline/blank line embedded
# in it, which corrupts the request URL.
url = ('https://list.jd.com/list.html?cat=1713,3264,3414&page=1&delivery=1'
       '&sort=sort_totalsales15_desc&trans=1&JL=4_10_0#J_main')

driver = webdriver.Chrome()
driver.get(url)

date_info = []
name_data, price_data = [], []
jd_goods_data = {}  # product name -> price
for q in range(page_num):
    # BUG FIX: re-parse the page source on EVERY iteration. The original
    # parsed `page_source` once before the loop, so after clicking
    # "next page" it kept scraping the stale first-page HTML — which is
    # why the first page's data was downloaded twice.
    selctor = etree.HTML(driver.page_source)

    # Items 1..59 of the product list (same range the original
    # while/break loop effectively covered).
    for i in range(1, 60):
        name_nodes = selctor.xpath(
            '//*[@id="plist"]/ul/li[%d]/p/p[3]/a/em/text()' % i)
        price_nodes = selctor.xpath(
            '//*[@id="plist"]/ul/li[%d]/p/p[2]/strong[1]/i/text()' % i)
        # Guard against pages with fewer than 59 items instead of
        # crashing with IndexError on `[0]`.
        if not name_nodes or not price_nodes:
            continue
        name = name_nodes[0]
        price = price_nodes[0]
        name_data.append(name)
        price_data.append(price)
        jd_goods_data[name] = price
        print(name_data)

    # Navigate to the next page — but not after the last wanted page,
    # where the "next" link may be absent.
    if q < page_num - 1:
        time.sleep(2)
        driver.find_element_by_xpath(
            '//*[@id="J_bottomPage"]/span[1]/a[10]').click()
        time.sleep(2)

# Write the results once, after all pages are scraped (the original
# rewrote the whole file for every single item). ensure_ascii=False
# keeps the Chinese product names human-readable in the JSON file.
with open(file_name, 'w', encoding='utf-8') as f:
    json.dump(jd_goods_data, f, ensure_ascii=False)

# Release the browser instead of leaking the WebDriver session.
driver.quit()
PHPz · asked 2756 days ago · 633 views

All replies (1) · Reply

  • 黄舟

    黄舟2017-05-18 10:54:18

    import requests
    from lxml import html, etree
    from selenium import webdriver
    import time, json

    # How many result pages to scan.
    page_numnotint = input("how many page do you want to scan")
    page_num = int(page_numnotint)
    file_name = 'jd_goods_data.json'

    driver = webdriver.Chrome()
    date_info = []
    name_data, price_data = [], []
    jd_goods_data = {}  # product name -> price
    for q in range(page_num):
        # BUG FIX: JD's `page` query parameter is 1-based. The original
        # used page=q, so the first iteration requested page=0 (presumably
        # served as page 1 — verify against the site) and the second
        # requested page=1 again, duplicating the first page's data.
        url = ('https://list.jd.com/list.html?cat=1713,3264,3414'
               '&page={page}&delivery=1&sort=sort_totalsales15_desc'
               '&trans=1&JL=4_10_0#J_main').format(page=q + 1)
        driver.get(url)
        # Parse the freshly loaded page so each iteration scrapes new data.
        selctor = etree.HTML(driver.page_source)

        # Items 1..59 of the product list (same range the original
        # while/break loop effectively covered).
        for i in range(1, 60):
            name_nodes = selctor.xpath(
                '//*[@id="plist"]/ul/li[%d]/p/p[3]/a/em/text()' % i)
            price_nodes = selctor.xpath(
                '//*[@id="plist"]/ul/li[%d]/p/p[2]/strong[1]/i/text()' % i)
            # Guard against pages with fewer than 59 items instead of
            # crashing with IndexError on `[0]`.
            if not name_nodes or not price_nodes:
                continue
            name = name_nodes[0]
            price = price_nodes[0]
            name_data.append(name)
            price_data.append(price)
            jd_goods_data[name] = price
            print(name_data)

    # ensure_ascii=False keeps the Chinese product names human-readable.
    with open(file_name, 'w', encoding='utf-8') as f:
        json.dump(jd_goods_data, f, ensure_ascii=False)

    driver.quit()

    reply
    0
  • Cancelreply