Home  >  Q&A  >  body text

Upload a CSV file of URLs through an HTML page and use Flask to read the URLs to crawl

I need to build a web-based system that accepts an uploaded CSV file containing a list of URLs. After the upload, the system reads the URLs line by line and passes them to the next step, crawling. The crawl requires logging in to the website first, and I already have the source code for logging in. The problem is that I want to connect an HTML page named "upload_page.html" to a Flask file named "upload_csv.py". Where should the source code for login and scraping be placed in the Flask file?

upload_page.html

<div class="upload">
    <h2>Upload a CSV file</h2>
    <!-- Posts the chosen file to the /upload route as multipart form data. -->
    <form action="/upload" method="post" enctype="multipart/form-data">
        <!-- "required" stops the form from being submitted with no file selected. -->
        <input type="file" name="file" accept=".csv" required>
        <br>
        <br>
        <button type="submit">Upload</button>
    </form>
</div>

upload_csv.py

from flask import Flask, request, render_template
import pandas as pd
import requests
from bs4 import BeautifulSoup
import csv
import json
import time
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

app = Flask(__name__)

@app.route('/')
def index():
    """Serve the page that holds the CSV upload form."""
    upload_form = render_template('upload_page.html')
    return upload_form

#Code for Login to the website


def _scroll_to_load(driver, duration=20, step=1000, pause=2):
    """Scroll the current page down in steps so lazily loaded content appears.

    Args:
        driver: Selenium WebDriver whose current page should be scrolled.
        duration: Approximate number of seconds to keep scrolling.
        step: Number of pixels to move down on every scroll.
        pause: Seconds to wait after each scroll so new data can load.
    """
    start = time.time()
    scroll_y = step
    while True:
        # window.scrollTo takes (x, y) coordinates, not a from/to range, so
        # keep x at 0 and only advance the vertical position.
        driver.execute_script(f"window.scrollTo(0, {scroll_y})")
        scroll_y += step
        time.sleep(pause)
        if round(time.time() - start) > duration:
            break


@app.route('/upload', methods=['POST'])
def upload():
    """Read the uploaded CSV and open every URL of its ``Link`` column.

    Expects a multipart form POST whose ``file`` field is a CSV file with a
    ``Link`` column. Each link is opened in one shared headless Chrome
    session, scrolled for about 20 seconds, and its page source is parsed
    with BeautifulSoup.

    Returns:
        The rendered ``index.html`` template with a completion message, or a
        plain-text message with status 400 when the upload is missing, is not
        a parsable CSV, or has no ``Link`` column.
    """
    # Read the uploaded file; .get() lets us answer with a clear 400 message
    # when the field is absent or no file was chosen.
    csv_file = request.files.get('file')
    if csv_file is None or not csv_file.filename:
        return 'No CSV file was uploaded', 400

    # Load the CSV data into a DataFrame
    try:
        df = pd.read_csv(csv_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError):
        return 'The uploaded file is not a valid CSV', 400
    if 'Link' not in df.columns:
        return "The CSV file must contain a 'Link' column", 400

    final_data = []  # filled by the scraping code below

    # The original code used `driver` without ever creating it; start one
    # browser here and reuse it for every link.
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    driver = webdriver.Chrome(options=chrome_options)
    try:
        # NOTE: the website login code belongs here, after the driver exists
        # and before the loop, so it runs once per upload and every page load
        # below reuses the logged-in browser session.

        # Loop over the links in the DataFrame and scrape each one; rows with
        # an empty Link cell are skipped.
        for link in df['Link'].dropna():
            driver.get(link)
            _scroll_to_load(driver)

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            # print(soup.prettify())

            #Code to do scrape the website
    finally:
        # Always close the browser, even when a page fails to load.
        driver.quit()

    return render_template('index.html', message='Scraped all data')


# Start Flask's built-in server (debug mode on) only when this file is run
# directly, not when it is imported as a module.
if __name__ == "__main__":
    app.run(debug=True)

Are my login and crawling code in the correct location? Also, the code is not working: after I click the upload button, nothing is processed.

P粉799885311P粉799885311408 days ago522

reply all(1)I'll reply

  • P粉207969787

    P粉2079697872023-09-08 00:06:27

    # Body of the upload() view: read the uploaded CSV, log in once, then open
    # and scroll every link in one shared headless Chrome session.
    csv_file = request.files['file']
    # Load the CSV data into a DataFrame
    df = pd.read_csv(csv_file)
    final_data = []
    # Initialize the web driver
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    driver = webdriver.Chrome(options=chrome_options)
    try:
        # Login to the website once, before the loop; the browser session is
        # reused for every link, so logging in per link is unnecessary.
        # Replace this with your own login code
        driver.get("https://example.com/login")
        # find_element_by_name() was removed in Selenium 4; use By locators.
        username_field = driver.find_element(By.NAME, "username")
        password_field = driver.find_element(By.NAME, "password")
        username_field.send_keys("myusername")
        password_field.send_keys("mypassword")
        password_field.send_keys(Keys.RETURN)
        # Wait for the login to complete
        WebDriverWait(driver, 10).until(EC.url_changes("https://example.com/login"))

        # Loop over the rows in the DataFrame and scrape each link
        for index, row in df.iterrows():
            link = row['Link']
            # Scrape the website
            driver.get(link)
            start = time.time()
            # vertical position (in pixels) of the next scroll
            scroll_y = 1000

            while True:
                # window.scrollTo takes (x, y) coordinates, not a from/to
                # range, so keep x at 0 and only advance the y position
                driver.execute_script(f"window.scrollTo(0, {scroll_y})")
                scroll_y += 1000

                # we will stop the script for 2 seconds so that the data can load
                time.sleep(2)
                end = time.time()
                # We will scroll for 20 seconds.
                if round(end - start) > 20:
                    break

            # NOTE: per-link scraping code goes here, inside the loop, while
            # the page is still loaded in the driver.
    finally:
        # Always close the browser, even if login or a page load fails
        driver.quit()

    reply
    0
  • Cancelreply