我正在尝试从网站(https://carone.com.uy/autos-usados-y-0km?p=21)中提取几个字段。其中一部分可以正常提取,另一部分却不行。例如,我能够提取名称、型号、价格和燃料类型,但无法正确提取“年份”和“公里数”字段,代码对这两个字段始终返回“N/A”。
这是我正在使用的代码:
import pandas as pd from datetime import date import os import socket import requests from bs4 import BeautifulSoup def scrape_product_data(url): try: headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3" } product_data = [] # Make the request to get the HTML content response = requests.get(url, headers=headers) response.raise_for_status() # Check if the request was successful soup = BeautifulSoup(response.text, 'html.parser') product_elements = soup.find_all('div', class_='product-item-info') for product_element in product_elements: # Extract product name, price, model, and attributes as before (same code as previous version) product_name_element = product_element.select_one('p.carone-car-info-data-brand.cursor-pointer') product_name = product_name_element.text.strip() if product_name_element else "N/A" product_price_element = product_element.find('span', class_='price') product_price = product_price_element.text.strip() if product_price_element else "N/A" product_model_element = product_element.select_one('p.carone-car-info-data-model') product_model = product_model_element.get('title').strip() if product_model_element else "N/A" # Extract product attributes attributes_div = product_element.find('div', class_='carone-car-attributes') year_element = attributes_div.find('p', class_='carone-car-attribute-title', text='Año') year_value = year_element.find_previous_sibling('p', class_='carone-car-attribute-value').text if year_element else "N/A" kilometers_element = attributes_div.find('p', class_='carone-car-attribute-title', text='Kilómetros') kilometers_value = kilometers_element.find_previous_sibling('p', class_='carone-car-attribute-value').text if kilometers_element else "N/A" fuel_element = attributes_div.find('p', class_='carone-car-attribute-title', text='Combustible') fuel_value = fuel_element.find_previous_sibling('p', class_='carone-car-attribute-value').text if 
fuel_element else "N/A" # Append product data as a tuple (name, price, model, year, kilometers, fuel) to the list product_data.append((product_name, product_price, product_model, year_value, kilometers_value, fuel_value))
结果如下所示:(原帖此处为一张输出结果的截图)
我不明白为什么提到的值总是得到“N/A”,而其他的工作正常,方法是相同的。
P粉7594574202023-09-20 10:59:00
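问题是,该网站元素文本中的“Kilómetros”与代码里写的“Kilómetros”虽然看起来完全相同,但两者的 Unicode 编码形式不同(预组合字符 ó 与“字母 o 加组合重音符”是两种不同的码位序列),因此精确字符串匹配会失败,结果始终是“N/A”(“Año”即年份字段也是同样的情况,其中的 ñ 存在同样的差异):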
import unicodedata

# NOTE: `requests`, `BeautifulSoup` and `pd` are expected to be in scope already
# (they are imported as requests / bs4.BeautifulSoup / pandas by the snippet
# in the question).


def _nfc(text):
    """Return *text* in Unicode NFC form.

    "ó" can be stored either as one precomposed code point or as "o" followed
    by a combining accent. Both render identically but compare unequal, so
    every label is normalized before it is compared.
    """
    return unicodedata.normalize("NFC", text)


# Attribute titles looked up on each product card, normalized once up front.
_YEAR_LABEL = _nfc("Año")
_KILOMETERS_LABEL = _nfc("Kilómetros")
_FUEL_LABEL = _nfc("Combustible")


def _text_or_na(element):
    """Return the stripped text of *element*, or "N/A" when it was not found."""
    return element.text.strip() if element is not None else "N/A"


def _read_attributes(attributes_div):
    """Map each NFC-normalized attribute title to the text of its value element.

    Returns an empty dict when the attributes container is missing.
    """
    if attributes_div is None:
        return {}

    attributes = {}
    for title_element in attributes_div.find_all(
        "p", class_="carone-car-attribute-title"
    ):
        # The value is looked up as the closest preceding <p> sibling of the title.
        value_element = title_element.find_previous_sibling(
            "p", class_="carone-car-attribute-value"
        )
        if value_element is None:
            continue
        label = _nfc(title_element.get_text(strip=True))
        # Keep the first occurrence of a title, like a plain find() would.
        attributes.setdefault(label, value_element.text)
    return attributes


def scrape_product_data(url, timeout=30):
    """Scrape the product cards of a listing page into a DataFrame.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A pandas DataFrame with the columns "Name", "Price", "Model", "Year",
        "KM" and "Fuel". Any field that cannot be found is reported as "N/A".

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }

    # Without a timeout a stalled connection would block forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    product_data = []
    for product_element in soup.find_all("div", class_="product-item-info"):
        product_name = _text_or_na(
            product_element.select_one("p.carone-car-info-data-brand.cursor-pointer")
        )
        product_price = _text_or_na(product_element.find("span", class_="price"))

        # The model is read from the "title" attribute, which may be absent.
        product_model_element = product_element.select_one(
            "p.carone-car-info-data-model"
        )
        model_title = (
            product_model_element.get("title")
            if product_model_element is not None
            else None
        )
        product_model = model_title.strip() if model_title is not None else "N/A"

        attributes = _read_attributes(
            product_element.find("div", class_="carone-car-attributes")
        )

        product_data.append(
            (
                product_name,
                product_price,
                product_model,
                attributes.get(_YEAR_LABEL, "N/A"),
                attributes.get(_KILOMETERS_LABEL, "N/A"),
                attributes.get(_FUEL_LABEL, "N/A"),
            )
        )

    return pd.DataFrame(
        product_data, columns=["Name", "Price", "Model", "Year", "KM", "Fuel"]
    )


if __name__ == "__main__":
    df = scrape_product_data("https://carone.com.uy/autos-usados-y-0km?p=2")
    print(df)
打印结果:
Name Price Model Year KM Fuel 0 Renault Kwid US.000 KWID 1.0 INTENSE TACTIL 2018 82.390 NAFTA 1 Chevrolet Onix US.800 NEW ONIX 1.0T RS MT 2021 46.000 NAFTA 2 Suzuki Swift US.800 NUEVO SWIFT 1.2 GL AT 2020 63.641 NAFTA 3 Fiat Toro US.800 TORO 1.8 FREEDOM DC MT 2021 15.330 NAFTA 4 Renault Oroch US.300 NEW OROCH INTENS OUTSIDER 1.3T DC AT 2023 21.360 NAFTA 5 Renault Stepway US.100 STEPWAY PRIVILEGE 1.6 2017 60.010 NAFTA 6 Renault Kwid US.100 KWID 1.0 LIFE 2022 14 NAFTA 7 Chevrolet Onix US.800 NEW ONIX 1.0T PREMIER AT 2021 14.780 NAFTA 8 Nissan SENTRA B18 US.000 SENTRA B18 2.0 EXCLUSIVE AT 2022 30.430 NAFTA 9 Renault Kwid US.500 KWID 1.0 INTENSE MT 2020 37.660 NAFTA 10 Chevrolet Tracker US.300 TRACKER 1.8 LTZ 4X4 AT 2014 91.689 NAFTA 11 Chevrolet Onix US.600 NEW ONIX PLUS 1.2 LS 4P MT 2022 24.658 NAFTA