cari

Rumah  >  Soal Jawab  >  teks badan

Masalahnya ialah fungsi find_previous_sibling tidak berfungsi dengan betul apabila menggunakan BeautifulSoup untuk merangkak web

Saya cuba mengekstrak beberapa nilai daripada tapak web (https://carone.com.uy/autos-usados-y-0km?p=21). Ada yang berfungsi dengan baik, tetapi ada yang tidak. Contohnya, saya dapat mengekstrak nama, model, harga dan jenis bahan api, tetapi tidak dapat mengekstrak medan "Tahun" atau "Kilometer" dengan betul, kod itu sentiasa mengembalikan "N/A" sebagai nilai.

Ini adalah kod yang saya gunakan:

import pandas as pd
from datetime import date
import os
import socket
import requests
from bs4 import BeautifulSoup

def scrape_product_data(url):
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
        }

        product_data = []

        # Make the request to get the HTML content
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Check if the request was successful

        soup = BeautifulSoup(response.text, 'html.parser')
        product_elements = soup.find_all('div', class_='product-item-info')
        for product_element in product_elements:
            # Extract product name, price, model, and attributes as before (same code as previous version)
            product_name_element = product_element.select_one('p.carone-car-info-data-brand.cursor-pointer')
            product_name = product_name_element.text.strip() if product_name_element else "N/A"

            product_price_element = product_element.find('span', class_='price')
            product_price = product_price_element.text.strip() if product_price_element else "N/A"

            product_model_element = product_element.select_one('p.carone-car-info-data-model')
            product_model = product_model_element.get('title').strip() if product_model_element else "N/A"

            # Extract product attributes
            attributes_div = product_element.find('div', class_='carone-car-attributes')
            
            year_element = attributes_div.find('p', class_='carone-car-attribute-title', text='Año')
            year_value = year_element.find_previous_sibling('p', class_='carone-car-attribute-value').text if year_element else "N/A"

            kilometers_element = attributes_div.find('p', class_='carone-car-attribute-title', text='Kilómetros')
            kilometers_value = kilometers_element.find_previous_sibling('p', class_='carone-car-attribute-value').text if kilometers_element else "N/A"

            fuel_element = attributes_div.find('p', class_='carone-car-attribute-title', text='Combustible')
            fuel_value = fuel_element.find_previous_sibling('p', class_='carone-car-attribute-value').text if fuel_element else "N/A"

            # Append product data as a tuple (name, price, model, year, kilometers, fuel) to the list
            product_data.append((product_name, product_price, product_model, year_value, kilometers_value, fuel_value))

Hasilnya kelihatan seperti ini: masukkan penerangan imej di sini

Saya tidak faham mengapa nilai yang disebutkan sentiasa mendapat "N/A" manakala yang lain berfungsi dengan baik, kaedahnya adalah sama.

P粉187160883P粉187160883489 hari yang lalu745

membalas semua(1)saya akan balas

  • P粉759457420

    P粉7594574202023-09-20 10:59:00

    Masalahnya ialah laman web itu tidak menggunakan Kilómetros dalam teks elemen, sebaliknya Kil&oacutemetros (perkara yang sama berlaku untuk medan tahun, "Año"):

    def scrape_product_data(url):
        """Scrape the car listings on one results page into a DataFrame.

        Each ``div.product-item-info`` element on the page yields one row.
        Any field whose element (or expected attribute) is missing is
        reported as the string ``"N/A"`` instead of raising.

        Args:
            url: Address of the listing page to download.

        Returns:
            A ``pandas.DataFrame`` with the columns ``Name``, ``Price``,
            ``Model``, ``Year``, ``KM`` and ``Fuel``, one row per listing.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.Timeout: If the server does not answer in time.
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
        }

        # According to the explanation accompanying this code, the site writes
        # the accented labels with a malformed HTML entity (no trailing ';'),
        # e.g. "Kil&oacutemetros". html.parser cannot decode such an entity, so
        # the raw text reaches us unchanged. Accept both the proper spelling
        # and the raw form so the lookup works either way.
        # NOTE(review): "A&ntildeo" is inferred by analogy - confirm on the page.
        year_labels = ("Año", "A&ntildeo")
        kilometers_labels = ("Kilómetros", "Kil&oacutemetros")
        fuel_labels = ("Combustible",)

        def attribute_value(container, labels):
            """Return the value shown for the attribute titled with any of *labels*."""
            if container is None:
                return "N/A"
            title = container.find(
                "p",
                class_="carone-car-attribute-title",
                # The matcher receives the tag's .string, which is None when
                # the tag does not hold exactly one text node.
                string=lambda s: s is not None and s.strip() in labels,
            )
            if title is None:
                return "N/A"
            # The lookup goes backwards from the title: the value <p> is
            # expected among the title's preceding siblings.
            value = title.find_previous_sibling(
                "p", class_="carone-car-attribute-value"
            )
            return value.text if value is not None else "N/A"

        product_data = []

        # A timeout keeps the call from hanging forever on a stalled server.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")
        product_elements = soup.find_all("div", class_="product-item-info")
        for product_element in product_elements:
            product_name_element = product_element.select_one(
                "p.carone-car-info-data-brand.cursor-pointer"
            )
            product_name = (
                product_name_element.text.strip() if product_name_element else "N/A"
            )

            product_price_element = product_element.find("span", class_="price")
            product_price = (
                product_price_element.text.strip() if product_price_element else "N/A"
            )

            product_model_element = product_element.select_one(
                "p.carone-car-info-data-model"
            )
            # The model name is read from the "title" attribute; guard against
            # the attribute being absent as well as the element.
            product_model_title = (
                product_model_element.get("title") if product_model_element else None
            )
            product_model = (
                product_model_title.strip() if product_model_title else "N/A"
            )

            attributes_div = product_element.find("div", class_="carone-car-attributes")

            year_value = attribute_value(attributes_div, year_labels)
            kilometers_value = attribute_value(attributes_div, kilometers_labels)
            fuel_value = attribute_value(attributes_div, fuel_labels)

            product_data.append(
                (
                    product_name,
                    product_price,
                    product_model,
                    year_value,
                    kilometers_value,
                    fuel_value,
                )
            )

        return pd.DataFrame(
            product_data, columns=["Name", "Price", "Model", "Year", "KM", "Fuel"]
        )
    
    
    # Example run: scrape page 2 of the listing and print the resulting table.
    df = scrape_product_data("https://carone.com.uy/autos-usados-y-0km?p=2")
    print(df)
    

    Cetak hasil:

                     Name      Price                                 Model  Year      KM   Fuel
    0        Renault Kwid  US.000               KWID 1.0 INTENSE TACTIL  2018  82.390  NAFTA
    1      Chevrolet Onix  US.800                   NEW ONIX 1.0T RS MT  2021  46.000  NAFTA
    2        Suzuki Swift  US.800                 NUEVO SWIFT 1.2 GL AT  2020  63.641  NAFTA
    3           Fiat Toro  US.800                TORO 1.8 FREEDOM DC MT  2021  15.330  NAFTA
    4       Renault Oroch  US.300  NEW OROCH INTENS OUTSIDER 1.3T DC AT  2023  21.360  NAFTA
    5     Renault Stepway  US.100                 STEPWAY PRIVILEGE 1.6  2017  60.010  NAFTA
    6        Renault Kwid  US.100                         KWID 1.0 LIFE  2022      14  NAFTA
    7      Chevrolet Onix  US.800              NEW ONIX 1.0T PREMIER AT  2021  14.780  NAFTA
    8   Nissan SENTRA B18  US.000           SENTRA B18 2.0 EXCLUSIVE AT  2022  30.430  NAFTA
    9        Renault Kwid  US.500                   KWID 1.0 INTENSE MT  2020  37.660  NAFTA
    10  Chevrolet Tracker  US.300                TRACKER 1.8 LTZ 4X4 AT  2014  91.689  NAFTA
    11     Chevrolet Onix  US.600            NEW ONIX PLUS 1.2 LS 4P MT  2022  24.658  NAFTA
    

    balas
    0
  • Batalbalas