From 7d55d258ac26e82783ed240402316bdb596e54e8 Mon Sep 17 00:00:00 2001 From: Mara Karagianni Date: Tue, 22 Apr 2025 20:11:41 +0200 Subject: [PATCH] add user agents in scraping images script --- python/scrape/get_images.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/python/scrape/get_images.py b/python/scrape/get_images.py index 8b5c60c..ad2a717 100644 --- a/python/scrape/get_images.py +++ b/python/scrape/get_images.py @@ -1,3 +1,5 @@ +import random +import urllib import requests import time from bs4 import BeautifulSoup @@ -32,7 +34,16 @@ if not os.path.exists(save_folder): os.makedirs(save_folder) # Send GET request to the page -response = requests.get(input_url) +user_agents = [ + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15', + 'Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1', +] +headers = { + "User-Agent": random.choice(user_agents) + } + +response = requests.get(input_url, headers=headers) if response.status_code == 200: # Parse the HTML content with BeautifulSoup soup = BeautifulSoup(response.text, 'html.parser') @@ -47,10 +58,11 @@ if response.status_code == 200: # Check if img_url is complete; if not, adjust it accordingly if not img_url.startswith("http"): img_url = full_domain + "/" + img_url - + img_url = img_url.split("&") + img_url = img_url[0] try: # Send request to the image URL - img_data = requests.get(img_url).content + img_data = requests.get(img_url, headers=headers).content # Define file name and path img_name = os.path.join(save_folder, f"image_{idx}.jpg") # Write image data to file