Add scraping for archives site
This commit is contained in:
82
python/scrape/get_images_montreuil.py
Normal file
@@ -0,0 +1,82 @@
import random
import urllib
import requests
import time
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import os
import sys
import tldextract


# URL of the webpage with images
input_url = sys.argv[1]


# extract full domain
def split_domain_or_subdomain_and_path(url):
    # Parse the URL
    parsed_url = urlparse(url)
    extracted = tldextract.extract(url)

    # Build the full domain, including subdomain if present
    if extracted.subdomain:
        full_domain = f"{extracted.subdomain}.{extracted.domain}.{extracted.suffix}"
    else:
        full_domain = f"{extracted.domain}.{extracted.suffix}"

    return "https://" + full_domain


full_domain = split_domain_or_subdomain_and_path(input_url)
print(f"Domain/Subdomain: {full_domain}")
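# Example (hypothetical URL): for "https://sub.example.org/path/page.html"
# the function above would return "https://sub.example.org".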

# Folder to save images
save_folder = "downloaded_images"
if not os.path.exists(save_folder):
    os.makedirs(save_folder)

# Pick a random browser User-Agent for the request headers
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1',
]
headers = {
    "User-Agent": random.choice(user_agents)
}

# Send GET request to the page
response = requests.get(input_url, headers=headers)
if response.status_code == 200:
    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all image tags
    images = soup.find_all('img')

    # Loop through image tags
    for idx, img in enumerate(images):
        img_url = img.get('src')
        # Skip tags that have no src attribute
        if not img_url:
            continue

        # Check if img_url is complete; if not, prepend the domain
        if not img_url.startswith("http"):
            img_url = full_domain + "/" + img_url
        # Keep only the part of the URL before the first "&"
        img_url = img_url.split("&")[0]

        try:
            print(img_url)
            # Send request to the image URL
            img_data = requests.get(img_url, headers=headers).content
            # Define file name and path
            img_name = os.path.join(save_folder, f"image_{idx}.jpg")
            # Write image data to file
            with open(img_name, 'wb') as img_bytes:
                img_bytes.write(img_data)

            print(f"Downloaded {img_name}")
            time.sleep(1)

        except Exception as e:
            print(f"Failed to download {img_url}. Error: {e}")
else:
    print("Failed to retrieve the page.")
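# Example invocation (hypothetical URL, passed as the only argument):
#   python python/scrape/get_images_montreuil.py "https://example.org/gallery-page"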