I'm trying to extract several values from the website (https://carone.com.uy/autos-usados-y-0km?p=21). Some fields work fine, but others don't: I am able to extract the name, model, price and fuel type, but I cannot correctly extract the "Year" or "Kilometers" fields; for those two the code always returns "N/A" as the value.
This is the code I'm using:
import pandas as pd from datetime import date import os import socket import requests from bs4 import BeautifulSoup def scrape_product_data(url): try: headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3" } product_data = [] # Make the request to get the HTML content response = requests.get(url, headers=headers) response.raise_for_status() # Check if the request was successful soup = BeautifulSoup(response.text, 'html.parser') product_elements = soup.find_all('div', class_='product-item-info') for product_element in product_elements: # Extract product name, price, model, and attributes as before (same code as previous version) product_name_element = product_element.select_one('p.carone-car-info-data-brand.cursor-pointer') product_name = product_name_element.text.strip() if product_name_element else "N/A" product_price_element = product_element.find('span', class_='price') product_price = product_price_element.text.strip() if product_price_element else "N/A" product_model_element = product_element.select_one('p.carone-car-info-data-model') product_model = product_model_element.get('title').strip() if product_model_element else "N/A" # Extract product attributes attributes_div = product_element.find('div', class_='carone-car-attributes') year_element = attributes_div.find('p', class_='carone-car-attribute-title', text='Año') year_value = year_element.find_previous_sibling('p', class_='carone-car-attribute-value').text if year_element else "N/A" kilometers_element = attributes_div.find('p', class_='carone-car-attribute-title', text='Kilómetros') kilometers_value = kilometers_element.find_previous_sibling('p', class_='carone-car-attribute-value').text if kilometers_element else "N/A" fuel_element = attributes_div.find('p', class_='carone-car-attribute-title', text='Combustible') fuel_value = fuel_element.find_previous_sibling('p', class_='carone-car-attribute-value').text if 
fuel_element else "N/A" # Append product data as a tuple (name, price, model, year, kilometers, fuel) to the list product_data.append((product_name, product_price, product_model, year_value, kilometers_value, fuel_value))
The result looks like this: enter image description here
I don't understand why these two values always come back as "N/A" while the others work fine, even though the extraction method is the same.
P粉7594574202023-09-20 10:59:00
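The problem is that the label text the site uses for "Kilómetros" is not the same string as the "Kilómetros" literal in your code. The two look identical on screen but presumably differ in how the accented character is encoded, so the exact-match lookup finds nothing. The same goes for the year label, "Año". Use the exact text from the page: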
def _normalize_label(text):
    """Return *text* stripped and in Unicode NFC form, for label comparison.

    The same accented label can be written either as precomposed characters
    or as base letters followed by combining marks. Both render identically
    but compare unequal, so both sides are normalized before comparing.
    """
    # Function-scope import keeps this snippet self-contained; the other
    # names it uses (requests, BeautifulSoup, pd) are imported elsewhere.
    import unicodedata

    return unicodedata.normalize("NFC", text).strip()


def _text_or_na(element):
    """Return the stripped text of *element*, or "N/A" when it is missing."""
    return element.text.strip() if element is not None else "N/A"


def _attribute_value(attributes_div, label):
    """Return the value displayed for *label* in a car's attribute box.

    Args:
        attributes_div: The ``div.carone-car-attributes`` element of one
            listing, or ``None`` when the listing has no such element.
        label: Attribute title to look for, e.g. ``"Año"``.

    Returns:
        The text of the ``p.carone-car-attribute-value`` element that
        precedes the matching title, or ``"N/A"`` when the container, the
        title or the value is missing.
    """
    if attributes_div is None:
        return "N/A"

    wanted = _normalize_label(label)
    titles = attributes_div.find_all("p", class_="carone-car-attribute-title")
    for title_element in titles:
        if _normalize_label(title_element.get_text()) != wanted:
            continue
        # The lookup goes backwards from the title: the value paragraph is
        # searched for among the title's preceding siblings.
        value_element = title_element.find_previous_sibling(
            "p", class_="carone-car-attribute-value"
        )
        return value_element.text if value_element is not None else "N/A"
    return "N/A"


def scrape_product_data(url, *, timeout=30):
    """Scrape one listing page and return its cars as a DataFrame.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Name``, ``Price``,
        ``Model``, ``Year``, ``KM`` and ``Fuel``; one row per
        ``div.product-item-info`` element. Fields that cannot be found are
        reported as ``"N/A"``.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }

    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    product_data = []
    for product_element in soup.find_all("div", class_="product-item-info"):
        product_name = _text_or_na(
            product_element.select_one(
                "p.carone-car-info-data-brand.cursor-pointer"
            )
        )
        product_price = _text_or_na(
            product_element.find("span", class_="price")
        )

        # The model is read from the element's "title" attribute, which may
        # be absent even when the element itself exists.
        product_model_element = product_element.select_one(
            "p.carone-car-info-data-model"
        )
        model_title = (
            product_model_element.get("title")
            if product_model_element is not None
            else None
        )
        product_model = (
            model_title.strip() if model_title is not None else "N/A"
        )

        attributes_div = product_element.find(
            "div", class_="carone-car-attributes"
        )
        year_value = _attribute_value(attributes_div, "Año")
        kilometers_value = _attribute_value(attributes_div, "Kilómetros")
        fuel_value = _attribute_value(attributes_div, "Combustible")

        product_data.append(
            (
                product_name,
                product_price,
                product_model,
                year_value,
                kilometers_value,
                fuel_value,
            )
        )

    return pd.DataFrame(
        product_data, columns=["Name", "Price", "Model", "Year", "KM", "Fuel"]
    )


df = scrape_product_data("https://carone.com.uy/autos-usados-y-0km?p=2")
print(df)
Print results:
Name Price Model Year KM Fuel 0 Renault Kwid US.000 KWID 1.0 INTENSE TACTIL 2018 82.390 NAFTA 1 Chevrolet Onix US.800 NEW ONIX 1.0T RS MT 2021 46.000 NAFTA 2 Suzuki Swift US.800 NUEVO SWIFT 1.2 GL AT 2020 63.641 NAFTA 3 Fiat Toro US.800 TORO 1.8 FREEDOM DC MT 2021 15.330 NAFTA 4 Renault Oroch US.300 NEW OROCH INTENS OUTSIDER 1.3T DC AT 2023 21.360 NAFTA 5 Renault Stepway US.100 STEPWAY PRIVILEGE 1.6 2017 60.010 NAFTA 6 Renault Kwid US.100 KWID 1.0 LIFE 2022 14 NAFTA 7 Chevrolet Onix US.800 NEW ONIX 1.0T PREMIER AT 2021 14.780 NAFTA 8 Nissan SENTRA B18 US.000 SENTRA B18 2.0 EXCLUSIVE AT 2022 30.430 NAFTA 9 Renault Kwid US.500 KWID 1.0 INTENSE MT 2020 37.660 NAFTA 10 Chevrolet Tracker US.300 TRACKER 1.8 LTZ 4X4 AT 2014 91.689 NAFTA 11 Chevrolet Onix US.600 NEW ONIX PLUS 1.2 LS 4P MT 2022 24.658 NAFTA