Add random User-Agent headers to the image scraping script and truncate relative image URLs at the first "&"

2025-04-22 20:11:41 +02:00
parent a09846193d
commit 7d55d258ac
1 changed files with 15 additions and 3 deletions
--- a/python/scrape/get_images.py
+++ b/python/scrape/get_images.py
@@ -1,3 +1,5 @@
+import random
+import urllib
 import requests
 import time
 from bs4 import BeautifulSoup
@@ -32,7 +34,16 @@ if not os.path.exists(save_folder):
    os.makedirs(save_folder)

 # Send GET request to the page
-response = requests.get(input_url)
+user_agents = [
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15',
+    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1',
+]
+headers = {
+        "User-Agent": random.choice(user_agents)
+        }
+
+response = requests.get(input_url, headers=headers)
 if response.status_code == 200:
    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
@@ -47,10 +58,11 @@ if response.status_code == 200:
        # Check if img_url is complete; if not, adjust it accordingly
        if not img_url.startswith("http"):
            img_url = full_domain + "/" + img_url
-
+            img_url = img_url.split("&")
+            img_url = img_url[0]
        try:
            # Send request to the image URL
-            img_data = requests.get(img_url).content
+            img_data = requests.get(img_url, headers=headers).content
            # Define file name and path
            img_name = os.path.join(save_folder, f"image_{idx}.jpg")
            # Write image data to file