Rumah > Soal Jawab > teks badan
Saya cuba mengekstrak beberapa nilai daripada tapak web (https://carone.com.uy/autos-usados-y-0km?p=21). Ada yang berfungsi dengan baik, tetapi ada yang tidak. Contohnya, saya dapat mengekstrak nama, model, harga dan jenis bahan api, tetapi tidak dapat mengekstrak medan "Tahun" atau "Kilometer" dengan betul, kod itu sentiasa mengembalikan "N/A" sebagai nilai.
Ini adalah kod yang saya gunakan:
import pandas as pd from datetime import date import os import socket import requests from bs4 import BeautifulSoup def scrape_product_data(url): try: headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3" } product_data = [] # Make the request to get the HTML content response = requests.get(url, headers=headers) response.raise_for_status() # Check if the request was successful soup = BeautifulSoup(response.text, 'html.parser') product_elements = soup.find_all('div', class_='product-item-info') for product_element in product_elements: # Extract product name, price, model, and attributes as before (same code as previous version) product_name_element = product_element.select_one('p.carone-car-info-data-brand.cursor-pointer') product_name = product_name_element.text.strip() if product_name_element else "N/A" product_price_element = product_element.find('span', class_='price') product_price = product_price_element.text.strip() if product_price_element else "N/A" product_model_element = product_element.select_one('p.carone-car-info-data-model') product_model = product_model_element.get('title').strip() if product_model_element else "N/A" # Extract product attributes attributes_div = product_element.find('div', class_='carone-car-attributes') year_element = attributes_div.find('p', class_='carone-car-attribute-title', text='Año') year_value = year_element.find_previous_sibling('p', class_='carone-car-attribute-value').text if year_element else "N/A" kilometers_element = attributes_div.find('p', class_='carone-car-attribute-title', text='Kilómetros') kilometers_value = kilometers_element.find_previous_sibling('p', class_='carone-car-attribute-value').text if kilometers_element else "N/A" fuel_element = attributes_div.find('p', class_='carone-car-attribute-title', text='Combustible') fuel_value = fuel_element.find_previous_sibling('p', class_='carone-car-attribute-value').text if 
fuel_element else "N/A" # Append product data as a tuple (name, price, model, year, kilometers, fuel) to the list product_data.append((product_name, product_price, product_model, year_value, kilometers_value, fuel_value))
Hasilnya kelihatan seperti ini: masukkan penerangan imej di sini
Saya tidak faham mengapa nilai yang disebutkan sentiasa mendapat "N/A" manakala yang lain berfungsi dengan baik, kaedahnya adalah sama.
P粉7594574202023-09-20 10:59:00
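Masalahnya ialah laman web itu menulis "Kilómetros" dalam teks elemen dengan aksara yang kelihatan sama tetapi dikodkan secara berbeza dalam Unicode (contohnya huruf asas diikuti tanda aksen gabungan, bukannya satu aksara "ó" yang siap tergabung), jadi perbandingan rentetan yang tepat tidak sepadan. Perkara yang sama berlaku untuk "Año" (tahun). Gunakan rentetan yang sama seperti di laman web: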
import unicodedata


def _same_label(text, label):
    """Return True when *text* equals *label* after Unicode NFC normalization.

    Accented labels can be encoded either as one precomposed character or as
    a base letter followed by a combining mark. The two forms render the same
    but compare unequal, so both sides are normalized before comparing.
    """
    if text is None:
        # BeautifulSoup passes None when a tag has no single string child.
        return False
    normalized_text = unicodedata.normalize("NFC", str(text)).strip()
    return normalized_text == unicodedata.normalize("NFC", label)


def _attribute_value(attributes_div, label):
    """Return the value paired with the attribute title *label*, or "N/A"."""
    if attributes_div is None:
        return "N/A"
    title_element = attributes_div.find(
        "p",
        class_="carone-car-attribute-title",
        string=lambda text: _same_label(text, label),
    )
    if title_element is None:
        return "N/A"
    # The lookup goes backwards: the value <p> is searched for among the
    # siblings that precede the title <p>.
    value_element = title_element.find_previous_sibling(
        "p", class_="carone-car-attribute-value"
    )
    if value_element is None:
        return "N/A"
    return value_element.get_text(strip=True)


def scrape_product_data(url, timeout=30):
    """Scrape the car listing at *url* into a DataFrame.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A pandas DataFrame with the columns "Name", "Price", "Model",
        "Year", "KM" and "Fuel". Fields that cannot be found are "N/A".

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }

    # A timeout keeps the script from hanging forever on a stalled server.
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    product_data = []
    for product_element in soup.find_all("div", class_="product-item-info"):
        name_element = product_element.select_one(
            "p.carone-car-info-data-brand.cursor-pointer"
        )
        product_name = name_element.text.strip() if name_element else "N/A"

        price_element = product_element.find("span", class_="price")
        product_price = price_element.text.strip() if price_element else "N/A"

        model_element = product_element.select_one("p.carone-car-info-data-model")
        # The model name is read from the "title" attribute, which may be
        # absent even when the element itself exists.
        model_title = model_element.get("title") if model_element else None
        product_model = model_title.strip() if model_title else "N/A"

        attributes_div = product_element.find("div", class_="carone-car-attributes")

        product_data.append(
            (
                product_name,
                product_price,
                product_model,
                _attribute_value(attributes_div, "Año"),
                _attribute_value(attributes_div, "Kilómetros"),
                _attribute_value(attributes_div, "Combustible"),
            )
        )

    return pd.DataFrame(
        product_data, columns=["Name", "Price", "Model", "Year", "KM", "Fuel"]
    )


df = scrape_product_data("https://carone.com.uy/autos-usados-y-0km?p=2")
print(df)
Cetak hasil:
Name Price Model Year KM Fuel 0 Renault Kwid US.000 KWID 1.0 INTENSE TACTIL 2018 82.390 NAFTA 1 Chevrolet Onix US.800 NEW ONIX 1.0T RS MT 2021 46.000 NAFTA 2 Suzuki Swift US.800 NUEVO SWIFT 1.2 GL AT 2020 63.641 NAFTA 3 Fiat Toro US.800 TORO 1.8 FREEDOM DC MT 2021 15.330 NAFTA 4 Renault Oroch US.300 NEW OROCH INTENS OUTSIDER 1.3T DC AT 2023 21.360 NAFTA 5 Renault Stepway US.100 STEPWAY PRIVILEGE 1.6 2017 60.010 NAFTA 6 Renault Kwid US.100 KWID 1.0 LIFE 2022 14 NAFTA 7 Chevrolet Onix US.800 NEW ONIX 1.0T PREMIER AT 2021 14.780 NAFTA 8 Nissan SENTRA B18 US.000 SENTRA B18 2.0 EXCLUSIVE AT 2022 30.430 NAFTA 9 Renault Kwid US.500 KWID 1.0 INTENSE MT 2020 37.660 NAFTA 10 Chevrolet Tracker US.300 TRACKER 1.8 LTZ 4X4 AT 2014 91.689 NAFTA 11 Chevrolet Onix US.600 NEW ONIX PLUS 1.2 LS 4P MT 2022 24.658 NAFTA