#!/usr/bin/env python3
#coding: utf8

#Web connection + scraping
import mechanicalsoup
#Downloading images
import wget

from os import mkdir, listdir
from sys import path
from re import findall
from time import sleep
from urllib.error import URLError
import logging

'''
############## 4chan thread scraper ################
This script is designed to look for specific words on 4chan boards
and download all the images from the relevant threads.
Usage of a VPN is recommended since 4chan is a shady place.
Use at your own risk !

          .
           \\        ,`
            ( `   SquiP
             \'  "
              `-.__)_
'''

def getArgs():
    '''Gets all the arguments passed to the script and returns them in a
    parse_args()-type object.

    No args

    Returns:
        - args : an args object containing all the optional arguments passed
          to the script.
    '''
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("-u", "--url",
                        help="URL of the board. MUST INCLUDE FINAL /, ex: https://boards.4chan.org/b/",
                        action="store", type=str, required=True)
    parser.add_argument("-f", "--folder",
                        help="Folder in which downloads will go, ex: ./downloads",
                        action="store", type=str, required=True)
    parser.add_argument("-k", "--keyword",
                        help="Keyword or phrase to look for in the threads, ex: 'cute thread'. Can be used multiple times",
                        action="append", required=True)
    parser.add_argument("-l", "--logfile",
                        help="Name of the logfile. Please provide a name that is not already in use.",
                        type=str, required=False)
    parser.add_argument("-c", "--constant",
                        help="Constantly download",
                        action="store_true")

    #Creating the args object
    args = parser.parse_args()
    return args
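
#Example invocation (illustrative; the flags are the ones defined in getArgs()
#above, but the script name, board, folder and keywords are placeholders):
#
#   python3 4chan_scraper.py -u https://boards.4chan.org/b/ -f ./downloads \
#       -k 'cute thread' -k 'kitten' -l scraper.log -c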

def thread_finder(homepage, keyword):
    '''Returns a list of all the threads on the board homepage where a post
    mentions one of the keywords.

    Args:
        - homepage : bs4 soup object containing html from the homepage of the board
        - keyword : list of expressions to look for

    Returns:
        - hrefs : all the references to matching threads. They must be
          appended to the board URL to work.
    '''
    threads = homepage.findAll('div', {'class': 'thread'})
    hrefs = []
    for thread in threads:
        texts = thread.findAll('blockquote', {'class': 'postMessage'})
        for text in texts:
            for word in keyword:
                if (word.lower() in text.text.lower()
                        and "loli" not in text.text.lower()
                        and "shota" not in text.text.lower()):
                    print(f"Found {word}")
                    links = thread.findAll('a', {'title': 'Reply to this post'})
                    for link in links:
                        hrefs.append(link['href'])
    return hrefs

def html_get(url):
    '''Get html from the webpage.

    Args:
        - url : a str containing the url to scrape

    Returns:
        - page.soup : a BeautifulSoup object containing the html
    '''
    #Browser
    browser = mechanicalsoup.Browser()
    try:
        page = browser.get(url)
        return page.soup
    except Exception as e:
        print(f"Got error {e}")

def scraper(soup):
    '''Scrape a bs4 html object, find posts with images and get the
    full-size sources.

    Args:
        - soup : bs4 soup object

    Returns:
        - sources : a list of image sources
    '''
    tags = soup.findAll('a', {"class": "fileThumb"})
    sources = []
    for tag in tags:
        sources.append(tag['href'])

    #Got image sources, removing any leftover scheme prefix.
    #The replace() calls must be chained: assigning each one from item
    #separately would keep only the last substitution.
    for index, item in enumerate(sources):
        sources[index] = item.replace("http://", "").replace("https://", "").replace("//", "")
    return sources

def item_dl(sources, dlfolder):
    '''Download all items in the sources list to the folder dlfolder.

    Args:
        - sources : a list of URLs
        - global folder_content : see folder_watch()
    '''
    global folder_content

    for source in sources:
        fullsource = "http://" + source
        #Filename = everything after the last slash
        imagename = findall(r"[^\/]*$", source)[0]
        #Deduplicating: skip the download if the name (minus a 4-char
        #extension such as .jpg) is already in the folder
        if imagename[:-4] not in folder_content:
            name = wget.download(fullsource, out=dlfolder)
            print(f"{name} downloaded from {source}")
    return True

def folder_create(dlfolder):
    '''Create the folder if it does not exist.

    Args:
        - dlfolder : path of the folder to create
    '''
    try:
        #Making folder
        mkdir(dlfolder)
    except FileExistsError:
        print(f"{dlfolder} already exists, not creating")

def folder_watch(folder):
    '''Watch the content of a folder and store it in the global
    folder_content: a single string concatenating the names of all the
    files in the folder, used by item_dl() for deduplication.

    Args:
        - folder : folder to watch
    '''
    global folder_content

    folder_list = listdir(folder)
    folder_content = ""
    for i in folder_list:
        folder_content += i
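
#Minimal one-shot sketch of how the helpers above compose (illustrative only,
#not executed; the board URL, keyword and folder are placeholders):
#
#   soup = html_get("https://boards.4chan.org/b/")
#   for href in thread_finder(soup, ["cute thread"]):
#       subsources = scraper(html_get(f"https://boards.4chan.org/b/{href}"))
#       folder_watch("./downloads")          #refresh the dedup index
#       item_dl(subsources, "./downloads")   #skip already-known images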

def dl_threads(folder, url, log_enabled):
    '''Download the images from every thread on the board where a keyword
    matches. One pass; the --constant loop below repeats it.

    Args:
        - folder : folder to dl into
        - url : board to watch
        - log_enabled : set True if the logging lib is used
        - global keyword : list of expressions, set from the CLI arguments
    '''
    try:
        sleep(2)
        soup = html_get(url)
        hrefs = thread_finder(soup, keyword)
        #Dling all threads found
        for href in hrefs:
            subsoup = html_get(f"{url}{href}")
            subsources = scraper(subsoup)
            folder_watch(folder)
            try:
                item_dl(subsources, folder)
                if log_enabled:
                    logging.info(f"Downloaded {url}{href}")
                sleep(2)
            #wget downloads via urllib, so network failures surface as URLError
            except URLError as conn_err:
                if log_enabled:
                    logging.error(f"Got Error {conn_err}, pipes must be clogged lulz")
                else:
                    print(f"Got Error {conn_err}, pipes must be clogged lulz")
    except Exception as e:
        print(f"Houston, we had a problem with {url} and {folder}: \n{e}")

#Getting main elements
args = getArgs()
folder = args.folder
keyword = args.keyword

if args.logfile:
    #Creating the logfile next to the script (path[0])
    logging.basicConfig(
        format='%(asctime)s %(levelname)-8s %(message)s',
        filename=f"{path[0]}/{args.logfile}",
        level=logging.INFO,
        datefmt='%Y-%m-%d %H:%M:%S'
    )
    log_enabled = True
else:
    log_enabled = False

url = args.url
folder_create(folder)
folder_content = ""

print("Lurking...")
if not args.constant:
    #One pass over the board
    folder_watch(folder)
    dl_threads(folder, url, log_enabled)
else:
    #Watch the board forever
    while True:
        folder_watch(folder)
        dl_threads(folder, url, log_enabled)
        sleep(60)
print('Sayonara')
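
#With -l/--logfile, entries follow the basicConfig format above; an
#illustrative line (timestamp and thread reference are made up):
#
#   2021-05-04 13:37:00 INFO     Downloaded https://boards.4chan.org/b/thread/123456789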