add python image scrape script

This commit is contained in:
Mara Karagianni
2024-10-31 19:25:04 +01:00
parent 7ef8f2ffd5
commit ad3a364347
2 changed files with 78 additions and 0 deletions

python/scrape/README.md (Normal file, 12 lines added)

@@ -0,0 +1,12 @@
## A script that scrapes images from a given URL
The script uses the `requests` and `BeautifulSoup` libraries to retrieve and parse the HTML content, `tldextract` to work out the site's domain, and `os` for managing folders and saving the images. Scraping should be done ethically, following the website's robots.txt rules and terms of service.
```
pip install requests beautifulsoup4 tldextract
```
Run the script with the URL you want to scrape as its argument:
```
python cyberfeminist_images.py <URL>
```
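
The script itself does not consult robots.txt, so that check is left to the user. A minimal sketch of how such a check could look with Python's standard `urllib.robotparser` (the URL below is only a placeholder, not part of the script):
```
from urllib.robotparser import RobotFileParser
from urllib.parse import urljoin

page_url = "https://example.org/gallery"  # placeholder URL

# Fetch and parse the site's robots.txt
rp = RobotFileParser()
rp.set_url(urljoin(page_url, "/robots.txt"))
rp.read()

# '*' is the generic user agent; only scrape if fetching the page is allowed
if rp.can_fetch("*", page_url):
    print("robots.txt allows scraping this page")
else:
    print("robots.txt disallows scraping this page")
```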

python/scrape/cyberfeminist_images.py (Normal file, 66 lines added)

@@ -0,0 +1,66 @@
import requests
import time
from bs4 import BeautifulSoup
import os
import sys
import tldextract

# URL of the webpage with images, passed as the first command-line argument
input_url = sys.argv[1]

# Build the full domain, including the subdomain if present,
# and return it with an https:// scheme
def split_domain_or_subdomain_and_path(url):
    extracted = tldextract.extract(url)
    if extracted.subdomain:
        full_domain = f"{extracted.subdomain}.{extracted.domain}.{extracted.suffix}"
    else:
        full_domain = f"{extracted.domain}.{extracted.suffix}"
    return "https://" + full_domain

full_domain = split_domain_or_subdomain_and_path(input_url)
print(f"Domain/Subdomain: {full_domain}")

# Folder to save images
save_folder = "downloaded_images"
if not os.path.exists(save_folder):
    os.makedirs(save_folder)

# Send GET request to the page
response = requests.get(input_url)
if response.status_code == 200:
    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    # Find all image tags
    images = soup.find_all('img')
    # Loop through image tags
    for idx, img in enumerate(images):
        img_url = img.get('src')
        # Skip <img> tags without a src attribute
        if not img_url:
            continue
        # Handle protocol-relative URLs (//host/path)
        if img_url.startswith("//"):
            img_url = "https:" + img_url
        # Check if img_url is complete; if not, prepend the domain
        elif not img_url.startswith("http"):
            img_url = full_domain + "/" + img_url.lstrip("/")
        try:
            # Send request to the image URL
            img_data = requests.get(img_url).content
            # Keep the original extension when the URL has one, else fall back to .jpg
            extension = os.path.splitext(img_url.split("?")[0])[1] or ".jpg"
            # Define file name and path
            img_name = os.path.join(save_folder, f"image_{idx}{extension}")
            # Write image data to file
            with open(img_name, 'wb') as handler:
                handler.write(img_data)
            print(f"Downloaded {img_name}")
            # Pause between downloads to avoid overloading the server
            time.sleep(1)
        except Exception as e:
            print(f"Failed to download {img_url}. Error: {e}")
else:
    print("Failed to retrieve the page.")