#!/usr/bin/env python3
# coding: utf8

'''
############## 4chan thread scraper ################
Give me a 4chan thread, I'll download all of its pictures!
'''

# Scraper
from bs4 import BeautifulSoup
# Web connection
import mechanicalsoup
import wget
from os import mkdir, listdir
import argparse


def getArgs():
    '''Gets all the arguments passed to the script and returns them in a
    parse_args()-type object.

    No args

    Returns:
        - args: an args object containing all the arguments passed to the
          script.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument("-u", "--url", help="URL of the thread",
                        action="store", type=str, required=True)
    parser.add_argument("-f", "--folder",
                        help="Folder in which downloads will go, ex: ./downloads",
                        action="store", type=str, required=True)
    # Creating the args object
    args = parser.parse_args()
    return args


def html_get(url):
    '''Gets html from the webpage.

    Args:
        - url: a str containing the url to scrape

    Returns:
        - page.soup: a BeautifulSoup object containing the html, or None if
          the request failed
    '''
    # Browser
    browser = mechanicalsoup.Browser()
    try:
        page = browser.get(url)
        print(f"Got html from {url}")
        return page.soup
    except Exception as e:
        print(f"Got error {e}")
        return None


def scraper(soup):
    '''Scrapes a bs4 html object, finds posts with images and gets each
    full-size source.

    Args:
        - soup: bs4 soup object

    Returns:
        - sources: a list of image sources
    '''
    tags = soup.find_all('a', {"class": "fileThumb"})
    sources = []
    for tag in tags:
        sources.append(tag['href'])

    # Got image sources; strip any leftover scheme or protocol-relative
    # prefix so every entry is a bare host/path. Each replace must chain on
    # the previous result, not on the original string.
    for index, item in enumerate(sources):
        item = item.replace("http://", "")
        item = item.replace("https://", "")
        sources[index] = item.replace("//", "")

    print(f"{len(sources)} images found")
    return sources


def item_dl(sources, dlfolder):
    '''Downloads all items in the sources list to the folder dlfolder,
    which we try to create.

    Args:
        - sources: a list of URLs
        - dlfolder: path of the destination folder
    '''
    try:
        mkdir(dlfolder)
    except FileExistsError:
        print(f"{dlfolder} already exists, not creating")

    # Normalize the folder path once, before the download loop
    if dlfolder[-1] == "/":
        dlfolder = dlfolder[:-1]

    for source in sources:
        fullsource = "https://" + source
        if not deduplicate(fullsource, dlfolder):
            name = wget.download(fullsource, out=dlfolder)
            print(f"{name} downloaded")
        else:
            print(f"{source} is already there")


def deduplicate(url, folder):
    '''Takes a url to an image and a folder, and checks whether said image
    already exists in said folder.

    Args:
        - url: a str containing a full url to an image
        - folder: path of the download folder

    Returns:
        - True: the image is already in the folder
        - False: the image is not in the folder
    '''
    image_name = url.split("/")[-1]
    image_name = image_name.split("?")[0]
    files = listdir(folder)
    return image_name in files


if __name__ == "__main__":
    args = getArgs()
    folder = args.folder
    url = args.url
    soup = html_get(url)
    if soup is not None:
        sources = scraper(soup)
        item_dl(sources, folder)
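
# A minimal usage sketch. The script name and thread URL below are
# hypothetical, for illustration only; pass any live thread URL with -u
# and a destination folder with -f:
#
#   python3 4chan_scraper.py -u https://boards.4chan.org/wg/thread/1234567 -f ./downloads
#
# The script fetches the thread's html, collects the href of every
# <a class="fileThumb"> post, and downloads each full-size image into
# ./downloads, skipping any file already present in that folder.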