add python image scrape script

This commit is contained in:
Mara Karagianni
2024-10-31 19:25:04 +01:00
parent 7ef8f2ffd5
commit ad3a364347
2 changed files with 78 additions and 0 deletions

python/scrape/README.md (Normal file, 12 lines added)

@@ -0,0 +1,12 @@
## A script that scrapes images from a given URL
The script uses the `requests` and `BeautifulSoup` libraries to retrieve and parse the HTML content, `tldextract` to work out the site's domain, and `os` for managing folders and saving the images. Scraping should be done ethically, following the website's robots.txt rules and terms of service.
```
pip install requests beautifulsoup4 tldextract
```
Run the script with the URL you want to scrape as its argument:
```
python cyberfeminist_images.py <URL>
```
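
The script itself does not consult robots.txt, so that check is left to the user. A minimal sketch of how such a check could look with Python's standard `urllib.robotparser` (the URL below is only a placeholder, not part of the script):
```
from urllib.robotparser import RobotFileParser
from urllib.parse import urljoin

page_url = "https://example.org/gallery"  # placeholder URL

# Fetch and parse the site's robots.txt
rp = RobotFileParser()
rp.set_url(urljoin(page_url, "/robots.txt"))
rp.read()

# '*' is the generic user agent; only scrape if fetching the page is allowed
if rp.can_fetch("*", page_url):
    print("robots.txt allows scraping this page")
else:
    print("robots.txt disallows scraping this page")
```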

python/scrape/cyberfeminist_images.py (Normal file, 66 lines added)

@@ -0,0 +1,66 @@
import requests
import time
from bs4 import BeautifulSoup
import os
import sys
import tldextract

# URL of the webpage with images, passed as the first command-line argument
input_url = sys.argv[1]

# Build the full domain, including the subdomain if present,
# and return it with an https:// scheme
def split_domain_or_subdomain_and_path(url):
    extracted = tldextract.extract(url)
    if extracted.subdomain:
        full_domain = f"{extracted.subdomain}.{extracted.domain}.{extracted.suffix}"
    else:
        full_domain = f"{extracted.domain}.{extracted.suffix}"
    return "https://" + full_domain

full_domain = split_domain_or_subdomain_and_path(input_url)
print(f"Domain/Subdomain: {full_domain}")

# Folder to save images
save_folder = "downloaded_images"
if not os.path.exists(save_folder):
    os.makedirs(save_folder)

# Send GET request to the page
response = requests.get(input_url)
if response.status_code == 200:
    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    # Find all image tags
    images = soup.find_all('img')
    # Loop through image tags
    for idx, img in enumerate(images):
        img_url = img.get('src')
        # Skip <img> tags without a src attribute
        if not img_url:
            continue
        # Handle protocol-relative URLs (//host/path)
        if img_url.startswith("//"):
            img_url = "https:" + img_url
        # Check if img_url is complete; if not, prepend the domain
        elif not img_url.startswith("http"):
            img_url = full_domain + "/" + img_url.lstrip("/")
        try:
            # Send request to the image URL
            img_data = requests.get(img_url).content
            # Keep the original extension when the URL has one, else fall back to .jpg
            extension = os.path.splitext(img_url.split("?")[0])[1] or ".jpg"
            # Define file name and path
            img_name = os.path.join(save_folder, f"image_{idx}{extension}")
            # Write image data to file
            with open(img_name, 'wb') as handler:
                handler.write(img_data)
            print(f"Downloaded {img_name}")
            # Pause between downloads to avoid overloading the server
            time.sleep(1)
        except Exception as e:
            print(f"Failed to download {img_url}. Error: {e}")
else:
    print("Failed to retrieve the page.")