"""Scrape all <img> tags from a page and save the images locally.

Usage: python get_images_montreuil.py <page-url>
"""
import os
import random
import sys
import time
from urllib.parse import urlparse

# Folder where downloaded images are stored.
SAVE_FOLDER = "downloaded_images"

# Pool of User-Agent strings; one is picked at random per run so the
# scraper looks less like an automated client.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1',
]


def split_domain_or_subdomain_and_path(url):
    """Return the scheme-qualified host of *url*, e.g. "https://sub.example.com".

    The result always uses the "https://" prefix regardless of the input
    URL's scheme (matching the original behaviour).
    """
    # urlparse().hostname already includes any subdomain and strips
    # port/credentials, so no third-party TLD library is needed.
    return "https://" + urlparse(url).hostname


def download_images(input_url):
    """Fetch *input_url*, locate every <img> tag and save each image to disk.

    Images are written to SAVE_FOLDER as image_<idx>.jpg.  A failure on an
    individual image is reported and skipped; a failed page fetch is
    reported and aborts the run.
    """
    # Imported at function scope so the stdlib-only URL helper above stays
    # usable without the scraping dependencies installed.
    import requests
    from bs4 import BeautifulSoup

    full_domain = split_domain_or_subdomain_and_path(input_url)
    print(f"Domain/Subdomain: {full_domain}")

    os.makedirs(SAVE_FOLDER, exist_ok=True)

    headers = {"User-Agent": random.choice(USER_AGENTS)}
    # timeout so a stalled server cannot hang the script forever.
    response = requests.get(input_url, headers=headers, timeout=30)
    if response.status_code != 200:
        print("Failed to retrieve the page.")
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    for idx, img in enumerate(soup.find_all('img')):
        img_url = img.get('src')
        if not img_url:
            # <img> without a src attribute -- nothing to download.
            continue
        # Resolve relative URLs against the page's domain.
        if not img_url.startswith("http"):
            img_url = full_domain + "/" + img_url
        # Drop everything after the first '&' (site-specific query junk).
        img_url = img_url.split("&")[0]
        print(img_url)
        try:
            # Fetch the image exactly once.  The original fetched it twice
            # and wrote both payloads into the same file, corrupting every
            # saved image (the second fetch also lacked the UA header).
            img_data = requests.get(img_url, headers=headers, timeout=30).content
            img_name = os.path.join(SAVE_FOLDER, f"image_{idx}.jpg")
            with open(img_name, 'wb') as img_bytes:
                img_bytes.write(img_data)
            print(f"Downloaded {img_name}")
            # Be polite to the server between requests.
            time.sleep(1)
        except Exception as e:
            # Best-effort: report and continue with the next image.
            print(f"Failed to download {img_url}. Error: {e}")


if __name__ == "__main__":
    # URL of the webpage with images, taken from the command line.
    # Guarded so importing this module performs no I/O.
    download_images(sys.argv[1])