From bd16dc7a6eb82eeeb627d52f8a4811c40e087f56 Mon Sep 17 00:00:00 2001
From: Justine
Date: Sun, 7 Feb 2021 22:46:38 +0100
Subject: [PATCH] threadfinder

---
 4chanthreadfinder.py | 144 +++++++++++++++++++++++++++++++++++++++++++
 README.md            |  22 ++++++-
 scrape.py            |   2 +-
 3 files changed, 166 insertions(+), 2 deletions(-)
 create mode 100755 4chanthreadfinder.py

diff --git a/4chanthreadfinder.py b/4chanthreadfinder.py
new file mode 100755
index 0000000..90d3a94
--- /dev/null
+++ b/4chanthreadfinder.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+# coding: utf8
+
+# Scraper
+from bs4 import BeautifulSoup
+# Web connection
+import mechanicalsoup
+import wget
+from os import mkdir, listdir
+from re import findall
+'''
+############## 4chan thread scraper ################
+Searches a 4chan board for threads matching a keyword
+and downloads their images.
+'''
+
+
+def getArgs():
+    '''Gets all the arguments passed to the script and returns them in a parse_args()-type object.
+    No args
+    Returns:
+        - args: an args object containing all the arguments passed to the script.
+    '''
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-u", "--url", help="URL of the board. MUST INCLUDE FINAL /, ex: https://boards.4chan.org/b/", action="store", type=str, required=True)
+    parser.add_argument("-f", "--folder", help="Folder in which downloads will go, ex: ./downloads", action="store", type=str, required=True)
+    parser.add_argument("-k", "--keyword", help="Keyword or phrase to look for in the threads, ex: 'cute thread'", action="store", type=str, required=True)
+
+    # Creating the args object
+    args = parser.parse_args()
+
+    return args
+
+
+def thread_finder(homepage, keyword):
+    '''
+    Returns a list of all the threads whose posts mention the keyword on a 4chan board page.
+    Args:
+        - homepage: bs4 soup object containing html from the homepage of the board
+        - keyword: any word or phrase
+    Returns:
+        - hrefs: the relative links to all matching threads.
+          They must be appended to the board URL to form full thread links.
+    '''
+    threads = homepage.findAll('div', {'class': 'thread'})
+    hrefs = []
+
+    for thread in threads:
+        texts = thread.findAll('blockquote', {'class': 'postMessage'})
+        for text in texts:
+            if keyword.lower() in text.text.lower():
+                links = thread.findAll('a', {'title': 'Reply to this post'})
+                for link in links:
+                    hrefs.append(link['href'])
+                # Thread matches: stop scanning its other posts so the
+                # same links are not collected twice
+                break
+    return hrefs
+
+
+def html_get(url):
+    '''
+    Get html from the webpage
+    Args:
+        - url: a str containing the url to scrape
+    Returns:
+        - page.soup: a BeautifulSoup object containing the html
+    '''
+    # Browser
+    browser = mechanicalsoup.Browser()
+    try:
+        page = browser.get(url)
+        print(f"Got html from {url}")
+        return page.soup
+    except Exception as e:
+        print(f"Got error {e}")
+
+
+def scraper(soup):
+    '''
+    Scrape a bs4 html object, find posts w/ images and get full size source
+    Args:
+        - soup: bs4 soup item
+    Returns:
+        - sources: a list of image sources
+    '''
+    tags = soup.findAll('a', {"class": "fileThumb"})
+    sources = []
+    for tag in tags:
+        sources.append(tag['href'])
+
+    # Got image sources; strip any leading scheme or '//' so that a clean
+    # 'http://' can be prepended at download time. The replacements must be
+    # chained on the same string, otherwise only the last one takes effect.
+    for index, item in enumerate(sources):
+        sources[index] = item.replace("https://", "").replace("http://", "").replace("//", "", 1)
+    return sources
+
+
+def item_dl(sources, dlfolder):
+    '''
+    Download all items in the sources list to folder dlfolder, which we try to create
+    Args:
+        - sources: a list of URLs
+        - dlfolder: path of the download folder
+    '''
+    # Making folder
+    try:
+        mkdir(dlfolder)
+    except FileExistsError:
+        print(f"{dlfolder} already exists, not creating")
+
+    # Deduplicating: skip any image already present in the folder.
+    # Build a new list instead of popping from the one being iterated,
+    # which would silently skip elements.
+    dir_content = listdir(dlfolder)
+    new_sources = []
+    for source in sources:
+        imagename = findall(r"[^\/]*$", source)[0]
+        if imagename in dir_content:
+            print(f"Found duplicate {imagename}")
+        else:
+            new_sources.append(source)
+
+    for source in new_sources:
+        fullsource = "http://" + source
+        name = wget.download(fullsource, out=dlfolder)
+        print(f"{name} downloaded")
+
+
+# Getting main elements
+args = getArgs()
+folder = args.folder
+keyword = args.keyword
+url = args.url
+soup = html_get(url)
+hrefs = thread_finder(soup, keyword)
+
+# Downloading all threads found
+for href in hrefs:
+    print(f"going after {url}{href}")
+    subsoup = html_get(f"{url}{href}")
+    subsources = scraper(subsoup)
+    print(subsources)
+    item_dl(subsources, folder)
diff --git a/README.md b/README.md
index c88a73a..cb48db7 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,11 @@
 # Scrappers
 
 Two scrappers:
-* The 4chan one dls all images from a thread in best res
+* The 4chanscrape one downloads all images from a thread in best resolution
 * The other one simply looks for "img" in any given page and downloads images
+* 4chanthreadfinder looks for a keyword in thread posts, and downloads all images from the matching threads
 
+## 4chanscrape, imgscrape
 Install depedencies:
 
 ```
@@ -18,3 +20,21 @@ Use:
 
 * -u : URL of the page
 * -f : folder where you want to download all pictures
+
+## 4chanthreadfinder
+Install dependencies:
+
+```
+python3 -m pip install beautifulsoup4 mechanicalsoup wget --user
+```
+
+Use:
+
+```
+./4chanthreadfinder.py -u https://boards.4chan.org/b/ -f ./downloads/thread -k 'ylyl thread'
+```
+
+* -u : URL of the page
+* -f : folder where you want to download all pictures
+* -k : keyword or key phrase to search for (a single word works best!)
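+
+### How the keyword match works
+
+The filter in `thread_finder` is a plain case-insensitive substring test
+applied to the text of every post in a thread. A self-contained sketch of
+that check (the example strings here are made up):
+
+```
+keyword = "ylyl thread"
+post_text = "Best YLYL thread of the week"
+if keyword.lower() in post_text.lower():
+    print("thread matches")  # fires only if the whole phrase appears verbatim
+```
+
+A multi-word phrase must therefore appear verbatim in a post, which is why a
+single word usually catches more threads.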
+
diff --git a/scrape.py b/scrape.py
index c4240c8..5e791f5 100755
--- a/scrape.py
+++ b/scrape.py
@@ -60,7 +60,7 @@ def scraper(soup):
     tags=soup.findAll('img')
     sources = []
     for tag in tags:
-        sources.append(tag['href'])
+        sources.append(tag['src'])
 
     #Got image sources, removing any left http headers
     for index, item in enumerate(sources):
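
Note on the scrape.py hunk above: `<img>` tags carry their file location in
the `src` attribute; `href` only exists on `<a>` tags, so the old line would
raise a KeyError. A minimal standalone check of the fixed behaviour (the HTML
string is made up for illustration):

```
from bs4 import BeautifulSoup

html = '<a href="/thread/1"><img src="//i.example.org/pic.jpg"></a>'
soup = BeautifulSoup(html, 'html.parser')

# 'src' holds the image location; tag['href'] would raise a KeyError on <img>
print([img['src'] for img in soup.find_all('img')])
# -> ['//i.example.org/pic.jpg']
```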