add python image scrape script

add gitignore file
2024-10-31 19:25:04 +01:00 · 2024-10-31 19:06:32 +01:00
5 changed files with 89 additions and 31 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,10 @@
 # Environments
 venv/
 .venv/
 /pyvenv.cfg
 .python-version
 # Media
 media/
 downloaded_images
 downloaded_videos
--- a/README.md
+++ b/README.md
@@ -1,4 +1,5 @@
 # git repo for art num
 *The wiki will be updated with more information and useful snippets. Feel free to contribute.*
 test port ssh 
--- a/python/scrape/README.md
+++ b/python/scrape/README.md
@@ -0,0 +1,12 @@
 ## A script that scrapes images from a given URL
 We use the `requests` and `BeautifulSoup` libraries to retrieve and parse the HTML content. `os` and `shutil` are helpful for managing files and saving the images. Scraping should be done ethically, following the website's robots.txt rules and terms of service.
 ```
 pip install requests beautifulsoup4 tldextract
 ```
 Run the script, passing the URL of the page to scrape as the first argument:
 ```
 python get_images.py https://example.com/gallery
 ```
--- a/python/scrape/get_images.py
+++ b/python/scrape/get_images.py
@@ -0,0 +1,66 @@
 import os
 import sys
 import time
 from urllib.parse import urljoin, urlparse
 import requests
 import tldextract
 from bs4 import BeautifulSoup
 # URL of the webpage with images, taken from the first command-line argument.
 # Exit with a usage message (and a non-zero status) instead of an IndexError
 # traceback when the argument is missing.
 if len(sys.argv) < 2:
    sys.exit(f"Usage: python {os.path.basename(sys.argv[0])} <url>")
 input_url = sys.argv[1]
 # extract full domain
 def split_domain_or_subdomain_and_path(url):
    """Return the origin of *url*: scheme, host and (if given) port.

    The scheme of the input is preserved; "https" is used only when the
    input has no scheme. Any path, query, fragment or user info is dropped.

    Args:
        url: An absolute URL, or a bare "host/path" string without a scheme.

    Returns:
        A string such as "https://sub.example.org" or "http://localhost:8000".

    Raises:
        ValueError: If the URL contains a port that is not a valid number.
    """
    # Without "://" urlparse would treat the host as part of the path; the
    # "//" prefix makes it parse as a network location instead.
    parsed_url = urlparse(url if "://" in url else "//" + url)
    host = parsed_url.hostname or ""
    # urlparse strips the brackets from IPv6 literals; put them back so the
    # result is a valid URL prefix.
    if ":" in host:
        host = f"[{host}]"
    scheme = parsed_url.scheme or "https"
    port = parsed_url.port
    if port is not None:
        return f"{scheme}://{host}:{port}"
    return f"{scheme}://{host}"
 # Main script: fetch the page at input_url and save every <img> it references
 # into save_folder as image_<index>.jpg.
 full_domain = split_domain_or_subdomain_and_path(input_url)
 print(f"Domain/Subdomain: {full_domain}")
 # Folder to save images; exist_ok avoids a separate check-then-create step.
 save_folder = "downloaded_images"
 os.makedirs(save_folder, exist_ok=True)
 # Seconds to wait for a server before giving up, so an unresponsive host
 # cannot hang the script forever (requests has no timeout by default).
 request_timeout = 30
 # Send GET request to the page
 response = requests.get(input_url, timeout=request_timeout)
 if response.status_code == 200:
    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    # Find all image tags
    images = soup.find_all('img')
    # Loop through image tags
    for idx, img in enumerate(images):
        img_url = img.get('src')
        # Some <img> tags carry no src (e.g. lazy-loaded images); skip them
        # instead of crashing on None.
        if not img_url:
            print(f"Skipping image {idx}: no src attribute")
            continue
        # Best-effort download: any failure for one image is reported and the
        # loop moves on to the next one.
        try:
            # Resolve relative, root-relative and protocol-relative URLs
            # against the page that was actually served (after redirects).
            img_url = urljoin(response.url, img_url)
            # Send request to the image URL
            img_response = requests.get(img_url, timeout=request_timeout)
            # Do not save an HTTP error page as if it were an image.
            img_response.raise_for_status()
            # Define file name and path
            img_name = os.path.join(save_folder, f"image_{idx}.jpg")
            # Write image data to file
            with open(img_name, 'wb') as handler:
                handler.write(img_response.content)
            print(f"Downloaded {img_name}")
            # Pause between downloads to avoid hammering the server.
            time.sleep(1)
        except Exception as e:
            print(f"Failed to download {img_url}. Error: {e}")
 else:
    print("Failed to retrieve the page.")
--- a/references/python_arti_ref_mara.md
+++ b/references/python_arti_ref_mara.md
@@ -1,31 +0,0 @@
 # artistic ref : usage de python
 ## [Computational Poems : Les deux, Nick Montfort](https://nickm.com/2/les_deux.html) 
  - US digital artist / chercheur
  - générateur de poème online dynamique (javascript)
  - poème multilangue (fr, esp, cn) => dispositif de traduction (js)
 ## [The Great Netfix, *Ritasdatter & Gansing*](http://netflix.lnd4.net/)
 *a video store after the end of the world*
 - notion de de-clouding : proposition speculative de redistribution de la “cloud-base” contemporaine
 - activité de **scraping** de film Netflix via VPN (utilitaire de deplacement immatérielle de la localisation du client) & enregistrement **VHS**
 - dispositif rspi (WLAN) - tape recorder VHS
 ## [Videogrep, *Sam Lavigne* (2014)](https://antiboredom.github.io/videogrep/)
 - python script that searches through dialog in videos and combines the matches into a fresh video
 - e.g : condense toute les itération d’une expression d’une video originale
 - visibilisation de normalisation d’usage de stratégie marketing (element de langage) dans contexte politique -partisant-
 - commande ligne tool / python module en libre acces sur archive github du project
  ```videogrep -- input path/to/vid.mp4 --search 'search phrase'```
 ## [Unerasable Characters, *Winnie Soon*](https://calls.ars.electronica.art/2023/prix/winners/7149/)
 Prix Ars Electronica, 2023
 - scraping data censurées/suprimées from Weibo (chinese social media == twitter)
 - dispersion des ideogram dans matrice lumineuse physique
 - concatenation de l’ensemble des caractère par machine learning (Tensor Flow) pour republication sur source (Weibo) et production d’une édition physique
Author	SHA1	Message	Date
Mara Karagianni	ad3a364347	add python image scrape script	2024-10-31 19:25:04 +01:00
Mara Karagianni	7ef8f2ffd5	add gitignore file	2024-10-31 19:06:32 +01:00