我正在嘗試從網站(https://carone.com.uy/autos-usados-y-0km?p=21)中提取幾個值。有些工作正常,但有些不工作。例如,我能夠提取名稱、型號、價格和燃料類型,但無法正確提取“年份”或“公里數”字段,代碼始終返回“N/A”作為值。
這是我正在使用的程式碼:
import pandas as pd from datetime import date import os import socket import requests from bs4 import BeautifulSoup def scrape_product_data(url): try: headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3" } product_data = [] # Make the request to get the HTML content response = requests.get(url, headers=headers) response.raise_for_status() # Check if the request was successful soup = BeautifulSoup(response.text, 'html.parser') product_elements = soup.find_all('div', class_='product-item-info') for product_element in product_elements: # Extract product name, price, model, and attributes as before (same code as previous version) product_name_element = product_element.select_one('p.carone-car-info-data-brand.cursor-pointer') product_name = product_name_element.text.strip() if product_name_element else "N/A" product_price_element = product_element.find('span', class_='price') product_price = product_price_element.text.strip() if product_price_element else "N/A" product_model_element = product_element.select_one('p.carone-car-info-data-model') product_model = product_model_element.get('title').strip() if product_model_element else "N/A" # Extract product attributes attributes_div = product_element.find('div', class_='carone-car-attributes') year_element = attributes_div.find('p', class_='carone-car-attribute-title', text='Año') year_value = year_element.find_previous_sibling('p', class_='carone-car-attribute-value').text if year_element else "N/A" kilometers_element = attributes_div.find('p', class_='carone-car-attribute-title', text='Kilómetros') kilometers_value = kilometers_element.find_previous_sibling('p', class_='carone-car-attribute-value').text if kilometers_element else "N/A" fuel_element = attributes_div.find('p', class_='carone-car-attribute-title', text='Combustible') fuel_value = fuel_element.find_previous_sibling('p', class_='carone-car-attribute-value').text if 
fuel_element else "N/A" # Append product data as a tuple (name, price, model, year, kilometers, fuel) to the list product_data.append((product_name, product_price, product_model, year_value, kilometers_value, fuel_value))
結果看起來像這樣:(此處原為輸出結果的截圖,圖片未隨文字保留)
我不明白為什麼提到的值總是得到“N/A”,而其他的工作正常,方法是相同的。
P粉759457420 2023-09-20 10:59:00
問題是,該網站在元素的文本中使用的不是Kilómetros
,而是Kilómetros
(兩個字串看起來相同,但帶重音的字母所用的 Unicode 編碼形式不同,例如預組合字元與「基本字母+組合符號」之別;「年份」Año 也是同樣的情況):
_NOT_AVAILABLE = "N/A"


def _normalize_label(text):
    """Return *text* stripped and NFC-normalized so labels compare reliably.

    Accented labels such as "Año" or "Kilómetros" can be written either with
    precomposed characters or with a base letter followed by a combining
    mark. The two spellings look identical but are not equal as Python
    strings, so both sides of a comparison go through this function.
    """
    import unicodedata  # local import: keeps this block self-contained

    return unicodedata.normalize("NFC", text).strip()


def _stripped_text(element):
    """Return the stripped text of *element*, or "N/A" when it is missing."""
    return element.text.strip() if element is not None else _NOT_AVAILABLE


def _read_attributes(attributes_div):
    """Map each normalized attribute title to its value text.

    For every ``p.carone-car-attribute-title`` inside *attributes_div* the
    value is taken from the preceding ``p.carone-car-attribute-value``
    sibling. Titles without such a sibling are skipped; when a title occurs
    more than once the first occurrence wins. Returns an empty dict when
    *attributes_div* is ``None``.
    """
    values = {}
    if attributes_div is None:
        return values
    for title in attributes_div.find_all("p", class_="carone-car-attribute-title"):
        value = title.find_previous_sibling("p", class_="carone-car-attribute-value")
        if value is not None:
            values.setdefault(_normalize_label(title.text), value.text.strip())
    return values


def scrape_product_data(url):
    """Scrape the car listings found at *url* into a pandas DataFrame.

    Each ``div.product-item-info`` element yields one row with the columns
    ``Name``, ``Price``, ``Model``, ``Year``, ``KM`` and ``Fuel``. Any field
    that cannot be located in the markup is reported as ``"N/A"``.

    Raises whatever ``requests`` raises for connection problems, timeouts,
    or (via ``raise_for_status``) unsuccessful HTTP responses.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }

    # A timeout prevents the script from hanging forever on a stalled server.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    product_data = []
    for product_element in soup.find_all("div", class_="product-item-info"):
        product_name = _stripped_text(
            product_element.select_one("p.carone-car-info-data-brand.cursor-pointer")
        )
        product_price = _stripped_text(product_element.find("span", class_="price"))

        # The model is read from the element's ``title`` attribute, which
        # may be absent even when the element itself exists.
        product_model_element = product_element.select_one(
            "p.carone-car-info-data-model"
        )
        model_title = (
            product_model_element.get("title")
            if product_model_element is not None
            else None
        )
        product_model = model_title.strip() if model_title else _NOT_AVAILABLE

        attributes = _read_attributes(
            product_element.find("div", class_="carone-car-attributes")
        )
        # Look the labels up in normalized form so the match does not depend
        # on how the accented characters are encoded here or on the page.
        year_value = attributes.get(_normalize_label("Año"), _NOT_AVAILABLE)
        kilometers_value = attributes.get(
            _normalize_label("Kilómetros"), _NOT_AVAILABLE
        )
        fuel_value = attributes.get(_normalize_label("Combustible"), _NOT_AVAILABLE)

        product_data.append(
            (
                product_name,
                product_price,
                product_model,
                year_value,
                kilometers_value,
                fuel_value,
            )
        )

    return pd.DataFrame(
        product_data, columns=["Name", "Price", "Model", "Year", "KM", "Fuel"]
    )


df = scrape_product_data("https://carone.com.uy/autos-usados-y-0km?p=2")
print(df)
列印結果:
Name Price Model Year KM Fuel 0 Renault Kwid US.000 KWID 1.0 INTENSE TACTIL 2018 82.390 NAFTA 1 Chevrolet Onix US.800 NEW ONIX 1.0T RS MT 2021 46.000 NAFTA 2 Suzuki Swift US.800 NUEVO SWIFT 1.2 GL AT 2020 63.641 NAFTA 3 Fiat Toro US.800 TORO 1.8 FREEDOM DC MT 2021 15.330 NAFTA 4 Renault Oroch US.300 NEW OROCH INTENS OUTSIDER 1.3T DC AT 2023 21.360 NAFTA 5 Renault Stepway US.100 STEPWAY PRIVILEGE 1.6 2017 60.010 NAFTA 6 Renault Kwid US.100 KWID 1.0 LIFE 2022 14 NAFTA 7 Chevrolet Onix US.800 NEW ONIX 1.0T PREMIER AT 2021 14.780 NAFTA 8 Nissan SENTRA B18 US.000 SENTRA B18 2.0 EXCLUSIVE AT 2022 30.430 NAFTA 9 Renault Kwid US.500 KWID 1.0 INTENSE MT 2020 37.660 NAFTA 10 Chevrolet Tracker US.300 TRACKER 1.8 LTZ 4X4 AT 2014 91.689 NAFTA 11 Chevrolet Onix US.600 NEW ONIX PLUS 1.2 LS 4P MT 2022 24.658 NAFTA