threadfinder
This commit is contained in:
parent 6d08071c6c
commit bd16dc7a6e

4chanthreadfinder.py (new executable file, 144 lines)
@ -0,0 +1,144 @@
#!/usr/bin/env python3
#coding: utf8

#Scraper
from bs4 import BeautifulSoup
#Web connection
import mechanicalsoup
import wget
from os import mkdir, listdir
from re import findall

'''
############## 4chan thread scraper ################
Here we look for particular threads on 4chan and download their images
'''


def getArgs():
    '''Gets all the arguments passed to the script and returns them in a parse_args()-type object.

    No args

    Returns:
    - args : an args object containing all the arguments passed to the script.
    '''
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("-u", "--url", help="URL of the board. MUST INCLUDE FINAL /, ex : https://boards.4chan.org/b/", action="store", type=str, required=True)
    parser.add_argument("-f", "--folder", help="Folder in which downloads will go, ex : ./downloads", action="store", type=str, required=True)
    parser.add_argument("-k", "--keyword", help="Keyword or phrase to look for in the threads, ex : 'cute thread'", action="store", type=str, required=True)

    #Creating the args object
    args = parser.parse_args()

    return args


def thread_finder(homepage, keyword):
    '''
    Returns a list of all the threads where a post mentions keyword on the board homepage

    Args:
    - homepage : bs4 soup object containing html from the homepage of the board
    - keyword : any single word

    Returns:
    - hrefs : all the references to matching threads. They must be appended to the board URL to work
    '''
    threads = homepage.findAll('div', {'class': 'thread'})
    hrefs = []

    for thread in threads:
        texts = thread.findAll('blockquote', {'class': 'postMessage'})
        for text in texts:
            if keyword.lower() in text.text.lower():
                links = thread.findAll('a', {'title': 'Reply to this post'})
                for link in links:
                    hrefs.append(link['href'])
                #Stop at the first matching post so a thread that matches
                #several times is not collected several times
                break
    return hrefs


def html_get(url):
    '''
    Get html from the webpage

    Args:
    - url : a str containing the url to scrape

    Returns:
    - page.soup : a BeautifulSoup object containing the html
    '''
    #Browser
    browser = mechanicalsoup.Browser()
    try:
        page = browser.get(url)
        print(f"Got html from {url}")
        return page.soup
    except Exception as e:
        print(f"Got error {e}")
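
# Caveat, as an illustrative sketch rather than part of the script: on a
# failed request html_get prints the error and implicitly returns None,
# so a caller that pipes the result straight into scraper() will crash
# on None.findAll. A guard on the calling side could look like:
#
#     subsoup = html_get(f"{url}{href}")
#     if subsoup is None:
#         continue  #skip pages that could not be fetched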


def scraper(soup):
    '''
    Scrape a bs4 html object, find posts w/ images and get the full size source

    Args:
    - soup : bs4 soup item

    Returns:
    - sources : a list of image sources
    '''
    tags = soup.findAll('a', {"class": "fileThumb"})
    sources = []
    for tag in tags:
        sources.append(tag['href'])

    #Got image sources, removing any leftover scheme. The replace calls
    #are chained: reassigning from item three times in a row would keep
    #only the result of the last call
    for index, item in enumerate(sources):
        sources[index] = item.replace("http://", "").replace("https://", "").replace("//", "")

    return sources
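
# Illustrative note: 4chan file links are usually protocol-relative, so
# stripping the leading "//" here lets item_dl prepend a plain "http://".
# With a made-up filename:
#
#     "//i.4cdn.org/b/1234567890.jpg" -> "i.4cdn.org/b/1234567890.jpg"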


def item_dl(sources, dlfolder):
    '''
    Download all items in the sources list to folder dlfolder, which we try to create

    Args:
    - sources : a list of URLs
    '''
    #Making folder
    try:
        mkdir(dlfolder)
    except FileExistsError:
        print(f"{dlfolder} already exists, not creating")

    #Deduplicating: collect the remaining sources in a separate list,
    #since popping from the list being enumerated would skip elements
    dir_content = listdir(dlfolder)
    new_sources = []
    for source in sources:
        imagename = findall(r"[^\/]*$", source)[0]
        if imagename in dir_content:
            print(f"Found duplicate {imagename}")
        else:
            new_sources.append(source)

    for source in new_sources:
        fullsource = "http://" + source
        name = wget.download(fullsource, out=dlfolder)
        print(f"{name} downloaded")
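
# The deduplication key above comes from the regex [^\/]*$, which matches
# the trailing run of non-slash characters, i.e. the file name. With an
# illustrative path:
#
#     findall(r"[^\/]*$", "i.4cdn.org/b/pic.jpg")[0]  # -> 'pic.jpg'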


#Getting main elements
args = getArgs()
folder = args.folder
keyword = args.keyword
url = args.url
soup = html_get(url)
hrefs = thread_finder(soup, keyword)
sources = scraper(soup)
#item_dl(sources, folder)

#Downloading all threads found
for href in hrefs:
    print(f"going after {url}{href}")
    subsoup = html_get(f"{url}{href}")
    subsources = scraper(subsoup)
    print(subsources)
    item_dl(subsources, folder)

README.md (22 lines changed)
@ -1,9 +1,11 @@
 # Scrapers
 
 Two scrapers:
-* The 4chan one dls all images from a thread in best res
+* The 4chanscrape one dls all images from a thread in best res
 * The other one simply looks for "img" in any given page and downloads images
+* 4chanthreadfinder looks for a keyword in thread names, and dls all images from relevant threads
 
+## 4chanscrape, imgscrape
 Install dependencies:
 
 ```
@ -18,3 +20,21 @@ Use:
 
 * -u : URL of the page
 * -f : folder where you want to download all pictures
+
+## 4chanthreadfinder
+Install dependencies:
+
+```
+python3 -m pip install beautifulsoup4 mechanicalsoup wget --user
+```
+
+Use:
+
+```
+./4chanthreadfinder.py -u https://boards.4chan.org/b/ -f ./downloads/thread -k 'ylyl thread'
+```
+
+* -u : URL of the page
+* -f : folder where you want to download all pictures
+* -k : keyword or keyphrase to search (better use a single word!)
+
@ -60,7 +60,7 @@ def scraper(soup):
     tags=soup.findAll('img')
     sources = []
     for tag in tags:
-        sources.append(tag['href'])
+        sources.append(tag['src'])
 
     #Got image sources, removing any left http headers
     for index, item in enumerate(sources):
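
This last hunk fixes the attribute lookup in the img-based scraper: `<img>` tags carry their URL in `src`, not `href`, so `tag['href']` raised KeyError on every image. A minimal check of the behavior, with made-up markup:

```
from bs4 import BeautifulSoup

html = '<img src="//i.example.org/pic.jpg">'  #made-up markup
tag = BeautifulSoup(html, "html.parser").find('img')

print(tag['src'])       # -> //i.example.org/pic.jpg
print(tag.get('href'))  # -> None: <img> tags have no href attribute
```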