ImageScrapper/scrape.py

#!/usr/bin/env python3
# coding: utf8
'''
############## 4chan thread scraper ################
Give me a 4chan thread, I'll download all of its pictures!
'''
import argparse
from os import mkdir, listdir

# Scraper
from bs4 import BeautifulSoup
# Web connection
import mechanicalsoup
import wget
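
# Example invocation (hypothetical thread URL, assuming the script is executable):
#   ./scrape.py -u https://boards.4chan.org/b/thread/123456789 -f ./downloads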

def getArgs():
    '''Gets all the arguments passed to the script and returns them in a parse_args()-type object.
    No args
    Returns:
        - args : an args object containing all the arguments passed to the script.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument("-u", "--url", help="URL of the thread", action="store", type=str, required=True)
    parser.add_argument("-f", "--folder", help="Folder in which downloads will go, ex: ./downloads", action="store", type=str, required=True)
    # Creating the args object
    args = parser.parse_args()
    return args

def html_get(url):
    '''
    Get html from the webpage
    Args:
        - url : a str containing the url to scrape
    Returns:
        - page.soup : a BeautifulSoup object containing the html
    '''
    # Browser
    browser = mechanicalsoup.Browser()
    try:
        page = browser.get(url)
        print(f"Got html from {url}")
        return page.soup
    except Exception as e:
        print(f"Got error {e}")
        return None

def scraper(soup):
    '''
    Scrape a bs4 html object, find posts w/ images and get the full size source
    Args:
        - soup : a bs4 soup object
    Returns:
        - sources : a list of image sources
    '''
    tags = soup.find_all('a', {"class": "fileThumb"})
    sources = []
    for tag in tags:
        sources.append(tag['href'])
    # Got image sources, stripping any leftover protocol prefix
    for index, item in enumerate(sources):
        # Chain the replaces so each one applies to the already-stripped string
        sources[index] = item.replace("http://", "").replace("https://", "").replace("//", "")
    print(f"{len(sources)} images found")
    return sources
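
# For illustration: a protocol-relative href such as "//i.4cdn.org/b/123.jpg"
# (hypothetical file name) comes out of scraper() as "i.4cdn.org/b/123.jpg",
# ready for item_dl() to prepend "http://".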

def item_dl(sources, dlfolder):
    '''
    Download all items in the sources list to folder dlfolder, which we try to create
    Args:
        - sources : a list of URLs
        - dlfolder : path of the download folder
    '''
    try:
        mkdir(dlfolder)
    except FileExistsError:
        print(f"{dlfolder} already exists, not creating")
    # Strip a trailing slash so path handling stays consistent
    if dlfolder[-1] == "/":
        dlfolder = dlfolder[:-1]
    for source in sources:
        fullsource = "http://" + source
        if not deduplicate(fullsource, dlfolder):
            name = wget.download(fullsource, out=dlfolder)
            print(f"{name} downloaded")
        else:
            print(f"{source} is already there")
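
# A minimal call, with a hypothetical source list:
#   item_dl(["i.4cdn.org/b/123.jpg"], "./downloads")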

def deduplicate(url, folder):
    '''
    Takes a url to an image and a folder, checks if said image already exists in said folder
    Args:
        - url : a str containing a full url to an image
        - folder : path of the download folder
    Returns:
        - True : the image is already in the folder
        - False : the image is not in the folder
    '''
    # Keep only the file name, dropping any query string
    image_name = url.split("/")[-1]
    image_name = image_name.split("?")[0]
    files = listdir(folder)
    return image_name in files
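
# For example (hypothetical file name): if "123.jpg" is already in ./downloads,
# deduplicate("http://i.4cdn.org/b/123.jpg?s=1", "./downloads") returns True.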

if __name__ == "__main__":
    args = getArgs()
    folder = args.folder
    url = args.url
    soup = html_get(url)
    if soup is not None:
        sources = scraper(soup)
        item_dl(sources, folder)