#!/usr/bin/env python3
#coding: utf8

#Scraper
from bs4 import BeautifulSoup
#Web connection
import mechanicalsoup
import wget

from os import mkdir, listdir
from re import findall
from time import sleep
from threading import Thread

'''
############## 4chan thread scraper ################
Here we look for particular threads on 4chan and download their images.
'''


def getArgs():
    '''Gets all the arguments passed to the script and returns them in a
    parse_args()-type object.

    No args.

    Returns:
        - args: an args object containing all the arguments passed
          to the script.
    '''
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("-u", "--url",
                        help="URL of the board. MUST INCLUDE FINAL /, ex: https://boards.4chan.org/b/",
                        action="store", type=str, required=True)
    parser.add_argument("-f", "--folder",
                        help="Folder in which downloads will go, ex: ./downloads",
                        action="store", type=str, required=True)
    parser.add_argument("-k", "--keyword",
                        help="Keyword or phrase to look for in the threads, ex: 'cute thread'",
                        action="store", type=str, required=True)
    parser.add_argument("-c", "--constant",
                        help="Constantly download",
                        action="store_true")
    parser.add_argument("-t", "--threads",
                        help="Number of threads in case of constant run, defaults to 2",
                        action="store", type=int, required=False)
    #Creating the args object
    args = parser.parse_args()
    return args


def thread_finder(homepage, keyword):
    '''Returns a list of all the threads on the board homepage where OP
    mentioned the keyword.

    Args:
        - homepage: bs4 soup object containing html from the homepage of the board
        - keyword: any word or phrase

    Returns:
        - hrefs: all the references to matching threads. They must be
          appended to the board URL to work.
    '''
    threads = homepage.findAll('div', {'class': 'thread'})
    hrefs = []
    for thread in threads:
        texts = thread.findAll('blockquote', {'class': 'postMessage'})
        for text in texts:
            if keyword.lower() in text.text.lower() \
                    and "loli" not in text.text.lower() \
                    and "shota" not in text.text.lower():
                links = thread.findAll('a', {'title': 'Reply to this post'})
                for link in links:
                    hrefs.append(f"{link['href']}")
    return hrefs


def html_get(url):
    '''Gets html from the webpage.

    Args:
        - url: a str containing the url to scrape

    Returns:
        - page.soup: a BeautifulSoup object containing the html, or None if
          the request failed
    '''
    #Browser
    browser = mechanicalsoup.Browser()
    try:
        page = browser.get(url)
        print(f"Got html from {url}")
        return page.soup
    except Exception as e:
        print(f"Got error {e}")
        return None


def scraper(soup):
    '''Scrapes a bs4 html object, finds posts with images and gets their
    full-size sources.

    Args:
        - soup: bs4 soup object

    Returns:
        - sources: a list of image sources
    '''
    tags = soup.findAll('a', {"class": "fileThumb"})
    sources = []
    for tag in tags:
        sources.append(tag['href'])
    #Got image sources, removing any leftover scheme prefix.
    #The replacements must be chained: re-assigning from `item` each time
    #would discard the previous substitution.
    for index, item in enumerate(sources):
        sources[index] = item.replace("http://", "").replace("https://", "").replace("//", "")
    return sources


def item_dl(sources, dlfolder):
    '''Downloads all items in the sources list to folder dlfolder, which we
    try to create.

    Args:
        - sources: a list of image URLs
        - dlfolder: the destination folder
    '''
    #Making folder
    try:
        mkdir(dlfolder)
    except FileExistsError:
        print(f"{dlfolder} already exists, not creating")
    #Deduplicating: skip any image whose name (minus a three-letter
    #extension) is already present in the download folder
    dir_content = listdir(dlfolder)
    for source in sources:
        fullsource = "http://" + source
        imagename = findall(r"[^\/]*$", source)[0]
        if imagename[:-4] not in str(dir_content):
            name = wget.download(fullsource, out=dlfolder)
            print(f"{name} downloaded")
    return True


def constant_dl(folder, url, keyword):
    '''Constantly watches the board and downloads from every matching thread.

    Args:
        - folder: folder to dl into
        - url: board to watch
        - keyword: keyword or phrase to look for
    '''
    while True:
        sleep(2)
        soup = html_get(url)
        if soup is None:
            continue
        hrefs = thread_finder(soup, keyword)
        #Dling all threads found
        for href in hrefs:
            print(f"going after {url}{href}")
            subsoup = html_get(f"{url}{href}")
            subsources = scraper(subsoup)
            print(subsources)
            item_dl(subsources, folder)


#Getting main elements
args = getArgs()
folder = args.folder
keyword = args.keyword
threadnumber = args.threads if args.threads else 2
url = args.url

soup = html_get(url)
hrefs = thread_finder(soup, keyword)

if not args.constant:
    #Oneshot: dl all threads found once, then exit
    for href in hrefs:
        print(f"going after {url}{href}")
        subsoup = html_get(f"{url}{href}")
        subsources = scraper(subsoup)
        print(subsources)
        item_dl(subsources, folder)
else:
    #Constant run: range(threadnumber), not range(1, threadnumber), so we
    #actually spawn the requested number of watcher threads
    thread_objects = []
    for i in range(threadnumber):
        thread_objects.append(Thread(target=constant_dl, args=(folder, url, keyword)))
    for thread in thread_objects:
        thread.start()
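
#A minimal usage sketch. The script name "4scraper.py" and the board,
#folder and keyword values below are hypothetical examples, not values
#taken from this repo:
#
#  one-shot pass over all currently matching threads:
#    python3 4scraper.py -u https://boards.4chan.org/b/ -f ./downloads -k "cute thread"
#
#  constant watch with 4 parallel watcher threads:
#    python3 4scraper.py -u https://boards.4chan.org/b/ -f ./downloads -k "cute thread" -c -t 4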