Heim > Fragen und Antworten > Hauptteil
Ich muss derzeit ein webbasiertes System erstellen, das eine CSV-Datei mit einer Liste von URLs hochladen kann. Nach dem Hochladen liest das System die URL Zeile für Zeile und wird für den nächsten Crawling-Schritt verwendet. Hier erfordert das Crawlen, dass man sich vor dem Crawlen auf der Website anmeldet. Ich habe bereits den Quellcode für die Login-Website. Das Problem besteht jedoch darin, dass ich eine HTML-Seite namens „upload_page.html“ mit einer Flask-Datei namens „upload_csv.py“ verbinden möchte. Wo soll der Quellcode für Login und Scraping in der Flask-Datei abgelegt werden?
upload_page.html
<div class="upload"> <h2>Upload a CSV file</h2> <form action="/upload" method="post" enctype="multipart/form-data"> <input type="file" name="file" accept=".csv"> <br> <br> <button type="submit">Upload</button> </form> </div>
upload_csv.py
from flask import Flask, request, render_template import pandas as pd import requests from bs4 import BeautifulSoup import csv import json import time from time import sleep from selenium import webdriver from selenium.webdriver.common.keys import Keys from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.chrome.options import Options app = Flask(__name__) @app.route('/') def index(): return render_template('upload_page.html') #Code for Login to the website @app.route('/upload', methods=['POST']) def upload(): # Read the uploaded file csv_file = request.files['file'] # Load the CSV data into a DataFrameSS df = pd.read_csv(csv_file) final_data = [] # Loop over the rows in the DataFrame and scrape each link for index, row in df.iterrows(): link = row['Link'] response = requests.get(link) soup = BeautifulSoup(response.content, 'html.parser') start = time.time() # will be used in the while loop initialScroll = 0 finalScroll = 1000 while True: driver.execute_script(f"window.scrollTo({initialScroll},{finalScroll})") # this command scrolls the window starting from the pixel value stored in the initialScroll # variable to the pixel value stored at the finalScroll variable initialScroll = finalScroll finalScroll += 1000 # we will stop the script for 3 seconds so that the data can load time.sleep(2) end = time.time() # We will scroll for 20 seconds. if round(end - start) > 20: break src = driver.page_source soup = BeautifulSoup(driver.page_source, 'html.parser') # print(soup.prettify()) #Code to do scrape the website return render_template('index.html', message='Scraped all data') if __name__ == '__main__': app.run(debug=True)
Sind mein Login- und Scraping-Code an der richtigen Stelle? Allerdings funktioniert die Kodierung nicht und nachdem ich auf den Upload-Button geklickt habe, wird sie nicht verarbeitet
P粉2079697872023-09-08 00:06:27
csv_file = request.files['file'] # Load the CSV data into a DataFrame df = pd.read_csv(csv_file) final_data = [] # Initialize the web driver chrome_options = Options() chrome_options.add_argument("--headless") chrome_options.add_argument("--disable-gpu") driver = webdriver.Chrome(options=chrome_options) # Loop over the rows in the DataFrame and scrape each link for index, row in df.iterrows(): link = row['Link'] # Login to the website # Replace this with your own login code driver.get("https://example.com/login") username_field = driver.find_element_by_name("username") password_field = driver.find_element_by_name("password") username_field.send_keys("myusername") password_field.send_keys("mypassword") password_field.send_keys(Keys.RETURN) # Wait for the login to complete WebDriverWait(driver, 10).until(EC.url_changes("https://example.com/login")) # Scrape the website driver.get(link) start = time.time() # will be used in the while loop initialScroll = 0 finalScroll = 1000 while True: driver.execute_script(f"window.scrollTo({initialScroll},{finalScroll})") # this command scrolls the window starting from the pixel value stored in the initialScroll # variable to the pixel value stored at the finalScroll variable initialScroll = finalScroll finalScroll += 1000 # we will stop the script for 3 seconds so that the data can load time.sleep(2) end = time.time() # We will scroll for 20 seconds. if round(end - start) > 20: break