"""Scrape all <img> tags from a page and save the images locally.

Usage: python get_images_montreuil.py <page-url>
"""
import os
import random
import sys
import time
from urllib.parse import urlparse

# Folder where downloaded images are stored.
SAVE_FOLDER = "downloaded_images"

# Pool of User-Agent strings; one is picked at random per run so the
# scraper looks less like an automated client.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1',
]


def split_domain_or_subdomain_and_path(url):
    """Return the scheme-qualified host of *url*, e.g. "https://sub.example.com".

    The result always uses the "https://" prefix regardless of the input
    URL's scheme (matching the original behaviour).
    """
    # urlparse().hostname already includes any subdomain and strips
    # port/credentials, so no third-party TLD library is needed.
    return "https://" + urlparse(url).hostname


def download_images(input_url):
    """Fetch *input_url*, locate every <img> tag and save each image to disk.

    Images are written to SAVE_FOLDER as image_<idx>.jpg.  A failure on an
    individual image is reported and skipped; a failed page fetch is
    reported and aborts the run.
    """
    # Imported at function scope so the stdlib-only URL helper above stays
    # usable without the scraping dependencies installed.
    import requests
    from bs4 import BeautifulSoup

    full_domain = split_domain_or_subdomain_and_path(input_url)
    print(f"Domain/Subdomain: {full_domain}")

    os.makedirs(SAVE_FOLDER, exist_ok=True)

    headers = {"User-Agent": random.choice(USER_AGENTS)}
    # timeout so a stalled server cannot hang the script forever.
    response = requests.get(input_url, headers=headers, timeout=30)
    if response.status_code != 200:
        print("Failed to retrieve the page.")
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    for idx, img in enumerate(soup.find_all('img')):
        img_url = img.get('src')
        if not img_url:
            # <img> without a src attribute -- nothing to download.
            continue
        # Resolve relative URLs against the page's domain.
        if not img_url.startswith("http"):
            img_url = full_domain + "/" + img_url
        # Drop everything after the first '&' (site-specific query junk).
        img_url = img_url.split("&")[0]
        print(img_url)
        try:
            # Fetch the image exactly once.  The original fetched it twice
            # and wrote both payloads into the same file, corrupting every
            # saved image (the second fetch also lacked the UA header).
            img_data = requests.get(img_url, headers=headers, timeout=30).content
            img_name = os.path.join(SAVE_FOLDER, f"image_{idx}.jpg")
            with open(img_name, 'wb') as img_bytes:
                img_bytes.write(img_data)
            print(f"Downloaded {img_name}")
            # Be polite to the server between requests.
            time.sleep(1)
        except Exception as e:
            # Best-effort: report and continue with the next image.
            print(f"Failed to download {img_url}. Error: {e}")


if __name__ == "__main__":
    # URL of the webpage with images, taken from the command line.
    # Guarded so importing this module performs no I/O.
    download_images(sys.argv[1])