# art_num_2024/python/scrape/get_images_montreuil.py

import random
import urllib
import requests
import time
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import os
import sys
import tldextract

# URL of the webpage with images, taken from the first command-line argument.
# Exit with a usage message rather than an IndexError traceback when it is missing.
if len(sys.argv) < 2:
    sys.exit(f"Usage: python {sys.argv[0]} <page_url>")
input_url = sys.argv[1]

def split_domain_or_subdomain_and_path(url):
    """Return the scheme-and-host root of *url*, e.g. ``https://www.example.com``.

    The host (with any subdomain and explicit port) is taken from the URL's
    network location; path, query, fragment and credentials are dropped.

    Args:
        url: An absolute URL, or a bare ``host/path`` string without a scheme.

    Returns:
        ``"<scheme>://<host>[:<port>]"``. The scheme is kept when it is
        ``http`` or ``https`` and defaults to ``https`` otherwise.
    """
    parsed_url = urlparse(url)

    # Only trust a real web scheme: for "host:8080/x" urlparse() may report the
    # host itself as the scheme.
    scheme = parsed_url.scheme if parsed_url.scheme in ("http", "https") else "https"

    if not parsed_url.netloc:
        # Without a scheme urlparse() files the host under ``path``; a leading
        # "//" makes it parse as a network location instead.
        parsed_url = urlparse("//" + url.lstrip("/"))

    # Drop any "user:password@" prefix; host names are case-insensitive.
    host = parsed_url.netloc.rpartition("@")[2].lower()

    return f"{scheme}://{host}"

# Derive the site root of the requested page and report it.
full_domain = split_domain_or_subdomain_and_path(url=input_url)
print(f"Domain/Subdomain: {full_domain}")

# Folder to save images. exist_ok avoids the check-then-create race and fails
# early if a regular file already occupies the path.
save_folder = "downloaded_images"
os.makedirs(save_folder, exist_ok=True)

# Browser-like User-Agent strings; one is picked at random per run and sent
# with every request made by this script.
user_agents = [
    # Chrome 91 on Windows 10
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/91.0.4472.124 Safari/537.36",
    # Safari 14.1.2 on macOS
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/605.1.15 (KHTML, like Gecko) "
    "Version/14.1.2 Safari/605.1.15",
    # Mobile Safari on iPhone (iOS 14.6)
    "Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) "
    "AppleWebKit/605.1.15 (KHTML, like Gecko) "
    "Version/14.0 Mobile/15E148 Safari/604.1",
]
headers = {"User-Agent": random.choice(user_agents)}

# Fetch the page. The timeout keeps an unresponsive server from hanging the
# script indefinitely.
response = requests.get(input_url, headers=headers, timeout=30)
if response.status_code == 200:
    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Relative image paths resolve against <base href> when the page declares
    # one, otherwise against the page's own (post-redirect) URL.
    base_tag = soup.find('base', href=True)
    if base_tag:
        base_url = urllib.parse.urljoin(response.url, base_tag['href'])
    else:
        base_url = response.url

    # idx is the position among all <img> tags, so skipped or failed images
    # leave gaps in the image_<idx>.jpg numbering.
    for idx, img in enumerate(soup.find_all('img')):
        img_url = img.get('src')
        if not img_url:
            # <img> without a src attribute (or an empty one): nothing to fetch.
            continue

        # Check if img_url is complete; if not, make it absolute
        if not img_url.startswith("http"):
            # urljoin handles "/root-relative", "page-relative" and "//host/..."
            # forms without producing doubled slashes.
            img_url = urllib.parse.urljoin(base_url, img_url)
            # Keep only the part before the first "&".
            # NOTE(review): presumably drops extra query parameters — confirm.
            img_url = img_url.split("&")[0]

        print(img_url)
        try:
            # Download once and refuse HTTP error pages, so that only real
            # image bytes end up on disk.
            img_response = requests.get(img_url, headers=headers, timeout=30)
            img_response.raise_for_status()

            # Define file name and path
            img_name = os.path.join(save_folder, f"image_{idx}.jpg")
            # Write image data to file
            with open(img_name, 'wb') as img_file:
                img_file.write(img_response.content)
        except Exception as e:
            # Best effort: report the failure and continue with the next image.
            print(f"Failed to download {img_url}. Error: {e}")
        else:
            print(f"Downloaded {img_name}")
            # Pause between successful downloads to go easy on the server.
            time.sleep(1)
else:
    print("Failed to retrieve the page.")