ImageScrapper/4chanthreadfinder.py
#!/usr/bin/env python3
#coding: utf8
#Scraper
from bs4 import BeautifulSoup
#Web connection
import mechanicalsoup
import wget
from os import mkdir, listdir
from sys import path
from re import findall
from time import sleep
from threading import Thread
import logging
'''
############## 4chan thread scraper ################
This script is designed to look for specific words on 4chan boards, and download all images from the relevant threads.
Usage of a VPN is recommended since 4chan is a shady place. Use at your own risk !
. \\
,` ( ` SquiP
( \' "
`-.__)_
'''
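#Illustrative invocation (values are examples, not from this repository), using the flags defined in getArgs() below:
#   ./4chanthreadfinder.py -u https://boards.4chan.org/b/ -f ./downloads -k 'cute thread' -l finder.log -c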

def getArgs():
    '''Gets all the arguments passed to the script and returns them in a parse_args()-type object.
    No args
    Returns:
        - args : an args object containing all the optional arguments passed to the script.
    '''
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("-u", "--url", help="URL of the board. MUST INCLUDE FINAL /, ex: https://boards.4chan.org/b/", action="store", type=str, required=True)
    parser.add_argument("-f", "--folder", help="Folder in which downloads will go, ex: ./downloads", action="store", type=str, required=True)
    parser.add_argument("-k", "--keyword", help="Keyword or phrase to look for in the threads, ex: 'cute thread'. Argument can be used multiple times", action='append', required=True)
    parser.add_argument("-l", "--logfile", help="Name of the logfile. Please provide a name that is not already in use.", type=str, required=False)
    parser.add_argument("-c", "--constant", help="Constantly download", action="store_true")
    #Creating the args object
    args = parser.parse_args()
    return args

def thread_finder(homepage, keyword):
    '''
    Returns a list of links to all the threads whose posts contain a keyword, from the board's homepage
    Args:
        - homepage: bs4 soup object containing html from the homepage of the board
        - keyword : list of expressions to look for
    Returns:
        - hrefs : all the references to matching threads. They must be appended to the board URL to work
    '''
    threads = homepage.findAll('div', {'class': 'thread'})
    hrefs = []
    for thread in threads:
        texts = thread.findAll('blockquote', {'class': 'postMessage'})
        for text in texts:
            for word in keyword:
                if word.lower() in text.text.lower() and "loli" not in text.text.lower() and "shota" not in text.text.lower():
                    print(f"Found {word}")
                    links = thread.findAll('a', {'title': 'Reply to this post'})
                    for link in links:
                        hrefs.append(f"{link['href']}")
    return hrefs
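#Minimal usage sketch (illustrative values only): thread_finder() works on the soup returned by html_get():
#   board_soup = html_get("https://boards.4chan.org/b/")
#   hrefs = thread_finder(board_soup, ["cute thread"])
#Each href is relative to the board, so the full thread URL is f"{url}{href}", as done in dl_threads() below.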

def html_get(url):
    '''
    Get html from the webpage
    Args:
        - url : a str containing the url to scrape
    Returns:
        - page.soup: A BeautifulSoup object containing html
    '''
    #Browser
    browser = mechanicalsoup.Browser()
    try:
        page = browser.get(url)
        return page.soup
    except Exception as e:
        print(f"Got error {e}")

def scraper(soup):
    '''
    Scrape a bs4 html object, find posts w/ images and get full size source
    Args:
        - soup : bs4 soup item
    Returns:
        - sources : A list of image sources
    '''
    tags = soup.findAll('a', {"class": "fileThumb"})
    sources = []
    for tag in tags:
        sources.append(tag['href'])
    #Got image sources, removing any leftover protocol prefix
    for index, item in enumerate(sources):
        sources[index] = item.replace("http://", "").replace("https://", "").replace("//", "")
    return sources
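#The prefixes stripped above get a plain "http://" re-added in item_dl(); this normalises protocol-relative
#links (e.g. //i.4cdn.org/..., as 4chan image links typically appear) into URLs wget can download.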

def item_dl(sources, dlfolder):
    '''
    Download all items in the sources list to folder dlfolder, which we try to create
    Args:
        - sources : a list of URLs
        - global folder_content : see folder_watch()
    '''
    global folder_content
    #Deduplicating : skip anything whose name already appears in the download folder
    for source in sources:
        fullsource = "http://" + source
        imagename = findall(r"[^\/]*$", source)[0]
        if imagename[:-4] not in folder_content:  #drop the last 4 characters (usually the extension) before checking
            name = wget.download(fullsource, out=dlfolder)
            print(f"{name} downloaded from {source}")
    return True
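#wget.download() saves files under their remote name by default, so comparing imagename (minus its extension)
#against folder_content skips images that were already fetched on a previous pass.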

def folder_create(dlfolder):
    '''
    Create the folder if it does not exist
    Args:
        - dlfolder : path of folder to create
    '''
    try:
        #Making folder
        mkdir(dlfolder)
    except FileExistsError:
        print(f"{dlfolder} already exists, not creating")

def folder_watch(folder):
    '''
    Watch the content of a folder and store it in the global folder_content,
    a single string concatenating the names of all the elements in the folder.
    Args:
        - folder : folder to watch
    Sets:
        - global folder_content : said string, containing the names of all the files in the folder
    '''
    global folder_content
    folder_list = listdir(folder)
    folder_content = ""
    for i in folder_list:
        folder_content += i
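#Because folder_content is one concatenated string, the membership test in item_dl() is a plain substring check:
#cheap, but a name that happens to appear inside another file name also counts as already downloaded.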

def dl_threads(folder, url, log_enabled):
    '''
    Do one download pass: find matching threads on the board and download their images.
    Args:
        - folder: folder to dl into
        - url : board to watch
        - log_enabled : Set True if logging lib is used
    '''
    try:
        sleep(2)
        soup = html_get(url)
        hrefs = thread_finder(soup, keyword)  #keyword is the global list built from the -k arguments
        sources = scraper(soup)
        #item_dl(sources, folder)
        #Dling all threads found
        for href in hrefs:
            subsoup = html_get(f"{url}{href}")
            subsources = scraper(subsoup)
            folder_watch(folder)
            try:
                item_dl(subsources, folder)
                if log_enabled:
                    logging.info(f"Downloaded {url}{href}")
                sleep(2)
            except OSError as conn_err:
                #network errors (urllib's URLError, ConnectionError, ...) are OSError subclasses
                if log_enabled:
                    logging.error(f"Got Error {conn_err}, pipes must be clogged lulz")
                else:
                    print(f"Got Error {conn_err}, pipes must be clogged lulz")
    except Exception as e:
        print(f"Houston, we had a problem with {url} and {folder}: \n{e}")

#Getting main elements
args = getArgs()
folder = args.folder
keyword = args.keyword

if args.logfile:
    logfile = args.logfile
    #Creating Logfile
    logging.basicConfig(
        format='%(asctime)s %(levelname)-8s %(message)s',
        filename=f"{path[0]}/{args.logfile}",
        level=logging.INFO,
        datefmt='%Y-%m-%d %H:%M:%S'
    )
    log_enabled = True
else:
    log_enabled = False

url = args.url
soup = html_get(url)
hrefs = thread_finder(soup, keyword)
sources = scraper(soup)
folder_create(folder)
folder_content = ""
print("Lurking...")
#item_dl(sources, folder)
#Dling all threads found
#oneshot
if not args.constant:
    for href in hrefs:
        folder_watch(folder)
        dl_threads(folder, url, log_enabled)
else:
    while True:
        folder_watch(folder)
        dl_threads(folder, url, log_enabled)
        sleep(60)
print('Sayonara')