ImageScrapper/4chanthreadfinder.py
#!/usr/bin/env python3
#coding: utf8
#Scraper
from bs4 import BeautifulSoup
#Web connection
import mechanicalsoup
import wget
from os import mkdir, listdir
from sys import path
from re import findall
from time import sleep
from threading import Thread
import logging
'''
############## 4chan thread scraper ################
This script is designed to look for specific words on 4chan boards, and download all images from the relevant threads.
Usage of a VPN is recommended since 4chan is a shady place. Use at your own risk !
. \\
,` ( ` SquiP
( \' "
`-.__)_
'''
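#Illustrative invocation (values are examples, not from this repository), using the flags defined in getArgs() below:
#   ./4chanthreadfinder.py -u https://boards.4chan.org/b/ -f ./downloads -k 'cute thread' -l finder.log -c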

def getArgs():
    '''Gets all the arguments passed to the script and returns them in a parse_args()-type object.
    No args
    Returns:
        - args : an args object containing all the optional arguments passed to the script.
    '''
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("-u", "--url", help="URL of the board. MUST INCLUDE FINAL /, ex: https://boards.4chan.org/b/", action="store", type=str, required=True)
    parser.add_argument("-f", "--folder", help="Folder in which downloads will go, ex: ./downloads", action="store", type=str, required=True)
    parser.add_argument("-k", "--keyword", help="Keyword or phrase to look for in the threads, ex: 'cute thread'. Argument can be used multiple times", action='append', required=True)
    parser.add_argument("-l", "--logfile", help="Name of the logfile. Please provide a name that is not already in use.", type=str, required=False)
    parser.add_argument("-c", "--constant", help="Constantly download", action="store_true")
    #Creating the args object
    args = parser.parse_args()
    return args

def thread_finder(homepage, keyword):
    '''
    Returns a list of links to all the threads whose posts contain a keyword, from the board's homepage
    Args:
        - homepage: bs4 soup object containing html from the homepage of the board
        - keyword : list of expressions to look for
    Returns:
        - hrefs : all the references to matching threads. They must be appended to the board URL to work
    '''
    threads = homepage.findAll('div', {'class': 'thread'})
    hrefs = []
    for thread in threads:
        texts = thread.findAll('blockquote', {'class': 'postMessage'})
        for text in texts:
            for word in keyword:
                if word.lower() in text.text.lower() and "loli" not in text.text.lower() and "shota" not in text.text.lower():
                    print(f"Found {word}")
                    links = thread.findAll('a', {'title': 'Reply to this post'})
                    for link in links:
                        hrefs.append(f"{link['href']}")
    return hrefs
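#Minimal usage sketch (illustrative values only): thread_finder() works on the soup returned by html_get():
#   board_soup = html_get("https://boards.4chan.org/b/")
#   hrefs = thread_finder(board_soup, ["cute thread"])
#Each href is relative to the board, so the full thread URL is f"{url}{href}", as done in dl_threads() below.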

def html_get(url):
    '''
    Get html from the webpage
    Args:
        - url : a str containing the url to scrape
    Returns:
        - page.soup: A BeautifulSoup object containing html
    '''
    #Browser
    browser = mechanicalsoup.Browser()
    try:
        page = browser.get(url)
        return page.soup
    except Exception as e:
        print(f"Got error {e}")

def scraper(soup):
    '''
    Scrape a bs4 html object, find posts w/ images and get full size source
    Args:
        - soup : bs4 soup item
    Returns:
        - sources : A list of image sources
    '''
    tags = soup.findAll('a', {"class": "fileThumb"})
    sources = []
    for tag in tags:
        sources.append(tag['href'])
    #Got image sources, removing any leftover protocol prefix
    for index, item in enumerate(sources):
        sources[index] = item.replace("http://", "").replace("https://", "").replace("//", "")
    return sources
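#The prefixes stripped above get a plain "http://" re-added in item_dl(); this normalises protocol-relative
#links (e.g. //i.4cdn.org/..., as 4chan image links typically appear) into URLs wget can download.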

def item_dl(sources, dlfolder):
    '''
    Download all items in the sources list to folder dlfolder, which we try to create
    Args:
        - sources : a list of URLs
        - global folder_content : see folder_watch()
    '''
    global folder_content
    #Deduplicating : skip anything whose name already appears in the download folder
    for source in sources:
        fullsource = "http://" + source
        imagename = findall(r"[^\/]*$", source)[0]
        if imagename[:-4] not in folder_content:  #drop the last 4 characters (usually the extension) before checking
            name = wget.download(fullsource, out=dlfolder)
            print(f"{name} downloaded from {source}")
    return True
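#wget.download() saves files under their remote name by default, so comparing imagename (minus its extension)
#against folder_content skips images that were already fetched on a previous pass.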

def folder_create(dlfolder):
    '''
    Create the folder if it does not exist
    Args:
        - dlfolder : path of folder to create
    '''
    try:
        #Making folder
        mkdir(dlfolder)
    except FileExistsError:
        print(f"{dlfolder} already exists, not creating")

def folder_watch(folder):
    '''
    Watch the content of a folder and store it in the global folder_content,
    a single string concatenating the names of all the elements in the folder.
    Args:
        - folder : folder to watch
    Sets:
        - global folder_content : said string, containing the names of all the files in the folder
    '''
    global folder_content
    folder_list = listdir(folder)
    folder_content = ""
    for i in folder_list:
        folder_content += i
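#Because folder_content is one concatenated string, the membership test in item_dl() is a plain substring check:
#cheap, but a name that happens to appear inside another file name also counts as already downloaded.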

def dl_threads(folder, url, log_enabled):
    '''
    Do one download pass: find matching threads on the board and download their images.
    Args:
        - folder: folder to dl into
        - url : board to watch
        - log_enabled : Set True if logging lib is used
    '''
    try:
        sleep(2)
        soup = html_get(url)
        hrefs = thread_finder(soup, keyword)  #keyword is the global list built from the -k arguments
        sources = scraper(soup)
        #item_dl(sources, folder)
        #Dling all threads found
        for href in hrefs:
            subsoup = html_get(f"{url}{href}")
            subsources = scraper(subsoup)
            folder_watch(folder)
            try:
                item_dl(subsources, folder)
                if log_enabled:
                    logging.info(f"Downloaded {url}{href}")
                sleep(2)
            except OSError as conn_err:
                #network errors (urllib's URLError, ConnectionError, ...) are OSError subclasses
                if log_enabled:
                    logging.error(f"Got Error {conn_err}, pipes must be clogged lulz")
                else:
                    print(f"Got Error {conn_err}, pipes must be clogged lulz")
    except Exception as e:
        print(f"Houston, we had a problem with {url} and {folder}: \n{e}")

#Getting main elements
args = getArgs()
folder = args.folder
keyword = args.keyword

if args.logfile:
    logfile = args.logfile
    #Creating Logfile
    logging.basicConfig(
        format='%(asctime)s %(levelname)-8s %(message)s',
        filename=f"{path[0]}/{args.logfile}",
        level=logging.INFO,
        datefmt='%Y-%m-%d %H:%M:%S'
    )
    log_enabled = True
else:
    log_enabled = False

url = args.url
soup = html_get(url)
hrefs = thread_finder(soup, keyword)
sources = scraper(soup)
folder_create(folder)
folder_content = ""
print("Lurking...")
#item_dl(sources, folder)
#Dling all threads found
#oneshot
if not args.constant:
    for href in hrefs:
        folder_watch(folder)
        dl_threads(folder, url, log_enabled)
else:
    while True:
        folder_watch(folder)
        dl_threads(folder, url, log_enabled)
        sleep(60)
print('Sayonara')