#!/usr/bin/env python3
# coding: utf8

# Scraper
from bs4 import BeautifulSoup
# Web connection
import mechanicalsoup
import wget
from os import mkdir

'''
############## 4chan thread scraper ################
Give me a thread, I'll download all of its pictures!
'''


def getArgs():
    '''Gets all the arguments passed to the script and returns them in a
    parse_args()-type object.

    No args

    Returns:
        - args: an args object containing all the required arguments passed
          to the script.
    '''
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("-u", "--url", help="URL of the thread",
                        action="store", type=str, required=True)
    parser.add_argument("-f", "--folder",
                        help="Folder in which downloads will go, ex: ./downloads",
                        action="store", type=str, required=True)
    # Creating the args object
    args = parser.parse_args()
    return args


def html_get(url):
    '''Get html from the webpage.

    Args:
        - url: a str containing the URL to scrape

    Returns:
        - page.soup: a BeautifulSoup object containing the html,
          or None if the request failed
    '''
    # Browser
    browser = mechanicalsoup.Browser()
    try:
        page = browser.get(url)
        print(f"Got html from {url}")
        return page.soup
    except Exception as e:
        print(f"Got error {e}")
        return None


def scraper(soup):
    '''Scrape a bs4 html object, find posts with images and get their
    full-size sources.

    Args:
        - soup: a bs4 soup object

    Returns:
        - sources: a list of image sources
    '''
    tags = soup.find_all('a', {"class": "fileThumb"})
    sources = []
    for tag in tags:
        sources.append(tag['href'])

    # Got image sources; strip any leading protocol (or the bare "//" of a
    # protocol-relative URL) so we can prepend one ourselves later. The
    # replaces must be chained: reassigning from the original item each
    # time would keep only the last replacement.
    for index, item in enumerate(sources):
        sources[index] = item.replace("http://", "").replace("https://", "").replace("//", "")

    print(f"{len(sources)} images found")
    return sources


def item_dl(sources, dlfolder):
    '''Download all items in the sources list to the folder dlfolder,
    which we try to create.

    Args:
        - sources: a list of URLs
        - dlfolder: path of the download folder
    '''
    try:
        mkdir(dlfolder)
    except FileExistsError:
        print(f"{dlfolder} already exists, not creating")

    for source in sources:
        fullsource = "http://" + source
        name = wget.download(fullsource, out=dlfolder)
        print(f"{name} downloaded")


if __name__ == "__main__":
    args = getArgs()
    folder = args.folder
    url = args.url
    soup = html_get(url)
    if soup is not None:  # html_get returns None on connection errors
        sources = scraper(soup)
        item_dl(sources, folder)
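
# Usage sketch, assuming the script is saved as thread_scraper.py (hypothetical
# filename) and pointed at a live thread; the thread number below is made up:
#
#   python3 thread_scraper.py --url https://boards.4chan.org/b/thread/123456789 --folder ./downloads
#
# On a 4chan thread page, each <a class="fileThumb"> wraps a thumbnail but its
# href points at the full-size image, which is why scraper() collects those
# hrefs. Every image found should end up in ./downloads.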