Compare commits
1 Commits
ad3a364347
...
maxime
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f37bb67a26 |
10
.gitignore
vendored
10
.gitignore
vendored
@@ -1,10 +0,0 @@
|
|||||||
# Environments
|
|
||||||
venv/
|
|
||||||
.venv/
|
|
||||||
/pyvenv.cfg
|
|
||||||
.python-version
|
|
||||||
|
|
||||||
# Media
|
|
||||||
media/
|
|
||||||
downloaded_images
|
|
||||||
downloaded_videos
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
## A script that scrapes images from a given URL
|
|
||||||
We use the `requests` and `BeautifulSoup` libraries to retrieve and parse the HTML content, and `tldextract` to work out the domain of the page. `os` and `shutil` are helpful for managing files and saving the images. Scraping should be done ethically, following the website's robots.txt rules and terms of service.
|
|
||||||
|
|
||||||
```
|
|
||||||
pip install requests beautifulsoup4 tldextract
|
|
||||||
|
|
||||||
```
|
|
||||||
|
|
||||||
Run the script with:
|
|
||||||
```
|
|
||||||
python cyberfeminist_images.py <url>
|
|
||||||
```
|
|
||||||
@@ -1,66 +0,0 @@
|
|||||||
import requests
|
|
||||||
import time
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
from urllib.parse import urlparse
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
import tldextract
|
|
||||||
|
|
||||||
# URL of the webpage with images, taken from the first command-line argument
if len(sys.argv) < 2:
    # Exit with a usage hint instead of an IndexError traceback
    sys.exit(f"Usage: python {sys.argv[0]} <url>")
input_url = sys.argv[1]
|
|
||||||
|
|
||||||
# extract full domain
|
|
||||||
def split_domain_or_subdomain_and_path(url):
    """Return the scheme-and-host root of *url*, e.g. ``https://sub.example.com``.

    The host is rebuilt from the subdomain, domain and suffix reported by
    ``tldextract``; any port, path, query or fragment is not included.

    Args:
        url: Address of the page being scraped.

    Returns:
        The root URL as a string, without a trailing slash.
    """
    parsed_url = urlparse(url)
    extracted = tldextract.extract(url)

    # Join only the parts that are present: the subdomain is optional, and the
    # suffix is empty for hosts such as "localhost" or bare IP addresses, which
    # would otherwise leave a dangling "." in the host name.
    host_parts = (extracted.subdomain, extracted.domain, extracted.suffix)
    full_domain = ".".join(part for part in host_parts if part)

    # Keep an explicit http/https scheme so plain-http sites stay reachable;
    # anything else (including a missing scheme) falls back to https.
    scheme = parsed_url.scheme if parsed_url.scheme in ("http", "https") else "https"

    return f"{scheme}://{full_domain}"
|
|
||||||
|
|
||||||
# Local import so this section stays self-contained; resolves image paths
# against the page URL.
from urllib.parse import urljoin

# Seconds to wait on any single HTTP request before giving up, so one
# unresponsive server cannot hang the whole run.
REQUEST_TIMEOUT = 30

full_domain = split_domain_or_subdomain_and_path(input_url)
print(f"Domain/Subdomain: {full_domain}")

# Folder to save images (created if missing; no error if it already exists)
save_folder = "downloaded_images"
os.makedirs(save_folder, exist_ok=True)

# Send GET request to the page
response = requests.get(input_url, timeout=REQUEST_TIMEOUT)
if response.status_code == 200:
    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all image tags
    images = soup.find_all('img')

    # Loop through image tags; idx is the tag's position on the page and is
    # used in the saved file name.
    for idx, img in enumerate(images):
        src = img.get('src')

        # Skip <img> tags without a src attribute and inline data: URIs,
        # neither of which can be fetched over HTTP.
        if not src or src.startswith("data:"):
            continue

        # Resolve relative, root-relative ("/a.png") and protocol-relative
        # ("//cdn/a.png") paths against the final page URL (after redirects).
        # Absolute URLs pass through unchanged.
        img_url = urljoin(response.url, src)

        try:
            # Send request to the image URL; raise on 4xx/5xx so an error
            # page is never written to disk as if it were an image.
            img_response = requests.get(img_url, timeout=REQUEST_TIMEOUT)
            img_response.raise_for_status()

            # Define file name and path
            img_name = os.path.join(save_folder, f"image_{idx}.jpg")

            # Write image data to file
            with open(img_name, 'wb') as handler:
                handler.write(img_response.content)
        except (requests.RequestException, OSError) as e:
            # Best effort: report the failure and move on to the next image
            print(f"Failed to download {img_url}. Error: {e}")
            continue

        print(f"Downloaded {img_name}")
        # Pause between successful downloads to avoid hammering the server
        time.sleep(1)
else:
    print("Failed to retrieve the page.")
|
|
||||||
Reference in New Issue
Block a user