mirror of
https://git.erg.school/P039/art_num_2024.git
synced 2026-02-04 13:09:20 +01:00
Add randomized User-Agent headers to the image-scraping script
This commit is contained in:
@@ -1,3 +1,5 @@
|
|||||||
|
import random
|
||||||
|
import urllib
|
||||||
import requests
|
import requests
|
||||||
import time
|
import time
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
@@ -32,7 +34,16 @@ if not os.path.exists(save_folder):
|
|||||||
os.makedirs(save_folder)
|
os.makedirs(save_folder)
|
||||||
|
|
||||||
# Send GET request to the page
|
# Send GET request to the page
|
||||||
response = requests.get(input_url)
|
user_agents = [
|
||||||
|
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
||||||
|
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15',
|
||||||
|
'Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1',
|
||||||
|
]
|
||||||
|
headers = {
|
||||||
|
"User-Agent": random.choice(user_agents)
|
||||||
|
}
|
||||||
|
|
||||||
|
response = requests.get(input_url, headers=headers)
|
||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
# Parse the HTML content with BeautifulSoup
|
# Parse the HTML content with BeautifulSoup
|
||||||
soup = BeautifulSoup(response.text, 'html.parser')
|
soup = BeautifulSoup(response.text, 'html.parser')
|
||||||
@@ -47,10 +58,11 @@ if response.status_code == 200:
|
|||||||
# Check if img_url is complete; if not, adjust it accordingly
|
# Check if img_url is complete; if not, adjust it accordingly
|
||||||
if not img_url.startswith("http"):
|
if not img_url.startswith("http"):
|
||||||
img_url = full_domain + "/" + img_url
|
img_url = full_domain + "/" + img_url
|
||||||
|
img_url = img_url.split("&")
|
||||||
|
img_url = img_url[0]
|
||||||
try:
|
try:
|
||||||
# Send request to the image URL
|
# Send request to the image URL
|
||||||
img_data = requests.get(img_url).content
|
img_data = requests.get(img_url, headers=headers).content
|
||||||
# Define file name and path
|
# Define file name and path
|
||||||
img_name = os.path.join(save_folder, f"image_{idx}.jpg")
|
img_name = os.path.join(save_folder, f"image_{idx}.jpg")
|
||||||
# Write image data to file
|
# Write image data to file
|
||||||
|
|||||||
Reference in New Issue
Block a user