import os
import random
import sys
import time
from urllib.parse import urljoin, urlparse

import requests
import tldextract
from bs4 import BeautifulSoup

# URL of the webpage with images
if len(sys.argv) < 2:
    print(f"Usage: {sys.argv[0]} <url>")
    sys.exit(1)
input_url = sys.argv[1]


def get_full_domain(url):
    """Return the scheme plus full domain of a URL, including any subdomain."""
    extracted = tldextract.extract(url)

    # Build the full domain, including subdomain if present
    if extracted.subdomain:
        full_domain = f"{extracted.subdomain}.{extracted.domain}.{extracted.suffix}"
    else:
        full_domain = f"{extracted.domain}.{extracted.suffix}"

    # Assumes the site is served over HTTPS
    return "https://" + full_domain


full_domain = get_full_domain(input_url)
print(f"Domain/Subdomain: {full_domain}")

# Folder to save images
save_folder = "downloaded_images"
os.makedirs(save_folder, exist_ok=True)

# Pick a random browser User-Agent so the request looks like ordinary traffic
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1',
]
headers = {"User-Agent": random.choice(user_agents)}

# Send GET request to the page
response = requests.get(input_url, headers=headers, timeout=10)

if response.status_code == 200:
    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all image tags
    images = soup.find_all('img')

    # Loop through image tags
    for idx, img in enumerate(images):
        img_url = img.get('src')

        # Skip <img> tags that have no src attribute
        if not img_url:
            continue

        # Resolve relative and protocol-relative URLs (e.g. "/pic.png" or
        # "//cdn.example.com/pic.png") against the page URL
        img_url = urljoin(input_url, img_url)

        # Drop everything after the first "&" to strip extra query parameters
        img_url = img_url.split("&")[0]

        try:
            # Send request to the image URL
            img_data = requests.get(img_url, headers=headers, timeout=10).content

            # Define file name and path, keeping the URL's file extension
            # when it has one and defaulting to .jpg otherwise
            ext = os.path.splitext(urlparse(img_url).path)[1] or ".jpg"
            img_name = os.path.join(save_folder, f"image_{idx}{ext}")

            # Write image data to file
            with open(img_name, 'wb') as img_bytes:
                img_bytes.write(img_data)

            print(f"Downloaded {img_name}")

            # Pause briefly between downloads to avoid hammering the server
            time.sleep(1)
        except Exception as e:
            print(f"Failed to download {img_url}. Error: {e}")
else:
    print("Failed to retrieve the page.")
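
# Example usage, assuming the script is saved as download_images.py
# (the filename is illustrative; the script takes the page URL as its
# only argument and saves images into ./downloaded_images):
#
#   python download_images.py https://example.com/gallery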