diff --git a/python/scrape/README.md b/python/scrape/README.md
new file mode 100644
index 0000000..8c5c3b7
--- /dev/null
+++ b/python/scrape/README.md
@@ -0,0 +1,11 @@
+## A script that scrapes images from a given URL
+We use the `requests` and `BeautifulSoup` libraries to retrieve and parse the HTML content, and `os` to manage the folder the images are saved into. Scraping should be done ethically, following the website's robots.txt rules and terms of service.
+
+```
+pip install requests beautifulsoup4
+```
+
+Run the script with:
+```
+python get_images.py <url>
+```
diff --git a/python/scrape/get_images.py b/python/scrape/get_images.py
new file mode 100644
index 0000000..277ac92
--- /dev/null
+++ b/python/scrape/get_images.py
@@ -0,0 +1,62 @@
+import os
+import sys
+import time
+from urllib.parse import urljoin, urlparse
+
+import requests
+from bs4 import BeautifulSoup
+
+
+def split_domain_or_subdomain_and_path(url):
+    """Return the scheme and full host of *url*, e.g. "https://sub.example.com".
+
+    urlparse's netloc already contains the subdomain (if any), the
+    registered domain and the public suffix, so no extra extraction library is needed.
+    """
+    parsed = urlparse(url)
+    return f"{parsed.scheme}://{parsed.netloc}"
+
+
+def main():
+    """Download every <img> found on the page given as the first CLI argument."""
+    if len(sys.argv) != 2:
+        print("Usage: python get_images.py <url>")
+        sys.exit(1)
+
+    input_url = sys.argv[1]
+    print(f"Domain/Subdomain: {split_domain_or_subdomain_and_path(input_url)}")
+
+    # Folder to save images
+    save_folder = "downloaded_images"
+    os.makedirs(save_folder, exist_ok=True)
+
+    response = requests.get(input_url, timeout=30)
+    if response.status_code != 200:
+        print("Failed to retrieve the page.")
+        return
+
+    soup = BeautifulSoup(response.text, "html.parser")
+
+    for idx, img in enumerate(soup.find_all("img")):
+        src = img.get("src")
+        if not src:
+            # Some <img> tags (e.g. lazy-loaded placeholders) carry no src.
+            continue
+
+        # urljoin resolves absolute, page-relative and protocol-relative
+        # URLs against the page URL; string concatenation cannot.
+        img_url = urljoin(input_url, src)
+
+        try:
+            img_data = requests.get(img_url, timeout=30).content
+            img_name = os.path.join(save_folder, f"image_{idx}.jpg")
+            with open(img_name, "wb") as handler:
+                handler.write(img_data)
+            print(f"Downloaded {img_name}")
+            time.sleep(1)  # be polite to the server between downloads
+        except Exception as e:
+            print(f"Failed to download {img_url}. Error: {e}")
+
+
+if __name__ == "__main__":
+    main()