Compare commits

2 Commits

| Author | SHA1 | Date |
|---|---|---|
| maxime | ad3a364347 |  |
|  | 7ef8f2ffd5 |  |

.gitignore (vendored, normal file, +10 lines)

@@ -0,0 +1,10 @@
# Environments
venv/
.venv/
/pyvenv.cfg
.python-version

# Media
media/
downloaded_images
downloaded_videos

python/scrape/README.md (normal file, +12 lines)

@@ -0,0 +1,12 @@
## A script that scrapes images from a given URL

We use the `requests` and `BeautifulSoup` libraries to retrieve and parse the HTML content, `tldextract` to rebuild the site's domain for relative image URLs, and `os` to manage the download folder and save the images. Scraping should be done ethically, following the website's robots.txt rules and terms of service.

```
pip install requests beautifulsoup4 tldextract
```

Run the script with the target URL as its first argument:

```
python get_images.py <URL>
```
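
The README's robots.txt reminder can also be checked programmatically. A minimal sketch (not part of this commit), using Python's standard `urllib.robotparser` and the same URL-as-first-argument convention as `get_images.py`:

```
# Sketch: consult robots.txt before scraping; assumes the target URL is
# passed as the first command-line argument, like in get_images.py.
import sys
from urllib import robotparser
from urllib.parse import urlparse

url = sys.argv[1]
parts = urlparse(url)

rp = robotparser.RobotFileParser()
rp.set_url(f"{parts.scheme}://{parts.netloc}/robots.txt")
rp.read()

if rp.can_fetch("*", url):
    print("robots.txt allows fetching this URL")
else:
    print("robots.txt disallows fetching this URL; do not scrape it")
```

If a site serves no robots.txt at all, `read()` generally leaves the parser permissive and `can_fetch` returns True.
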
python/scrape/get_images.py (normal file, +66 lines)

@@ -0,0 +1,66 @@
import requests
import time
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import os
import sys
import tldextract

# URL of the webpage with images
input_url = sys.argv[1]

# Extract the full domain (including any subdomain) of the input URL
def split_domain_or_subdomain_and_path(url):
    # Parse the URL
    parsed_url = urlparse(url)
    extracted = tldextract.extract(url)

    # Build the full domain, including subdomain if present
    if extracted.subdomain:
        full_domain = f"{extracted.subdomain}.{extracted.domain}.{extracted.suffix}"
    else:
        full_domain = f"{extracted.domain}.{extracted.suffix}"

    return "https://" + full_domain

full_domain = split_domain_or_subdomain_and_path(input_url)
print(f"Domain/Subdomain: {full_domain}")

# Folder to save images
save_folder = "downloaded_images"
if not os.path.exists(save_folder):
    os.makedirs(save_folder)

# Send GET request to the page
response = requests.get(input_url)
if response.status_code == 200:
    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all image tags
    images = soup.find_all('img')

    # Loop through image tags
    for idx, img in enumerate(images):
        img_url = img.get('src')

        # Skip <img> tags that have no src attribute
        if not img_url:
            continue

        # Check if img_url is complete; if not, prepend the domain
        if not img_url.startswith("http"):
            img_url = full_domain + "/" + img_url

        try:
            # Send request to the image URL
            img_data = requests.get(img_url).content
            # Define file name and path
            img_name = os.path.join(save_folder, f"image_{idx}.jpg")
            # Write image data to file
            with open(img_name, 'wb') as handler:
                handler.write(img_data)

            print(f"Downloaded {img_name}")
            time.sleep(1)

        except Exception as e:
            print(f"Failed to download {img_url}. Error: {e}")
else:
    print("Failed to retrieve the page.")
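
The script builds absolute image URLs by concatenating `full_domain + "/" + img_url`, which misbehaves for sources like `../img/x.jpg` or protocol-relative `//cdn.example.org/...` values. A minimal sketch of an alternative (not what this commit does), using the standard `urllib.parse.urljoin` with a hypothetical page URL:

```
# Sketch: resolve image src values against the page URL with urljoin
# (alternative to the string concatenation used in get_images.py).
from urllib.parse import urljoin

page_url = "https://example.org/gallery/index.html"  # hypothetical page
sources = ["/img/a.jpg", "img/b.png", "../c.gif", "//cdn.example.org/d.webp"]

for src in sources:
    print(urljoin(page_url, src))
# https://example.org/img/a.jpg
# https://example.org/gallery/img/b.png
# https://example.org/c.gif
# https://cdn.example.org/d.webp
```

Because each src is resolved against the page it was found on, absolute, relative, and protocol-relative paths all come out as fetchable URLs.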