Compare commits: eca7bed6f8 ... master (11 commits)

Commits in this range (SHA1):
ca78d58458, b2e7deb058, e06953ccf6, 77c20d67f1, 52fdd4f4b1, a97067d452, aa67b222d8, 5f31ab9fcc, 913208274b, 85a798b311, 2a5eb235e4
Binary image file changed, not shown (before: 88 KiB).
@@ -7,15 +7,28 @@ from bs4 import BeautifulSoup
 import mechanicalsoup
 import wget
 from os import mkdir, listdir
+from sys import path
 from re import findall
 from time import sleep
 from threading import Thread
+import logging

 '''
 ############## 4chan thread scrapper ################
-here we look for particular threads on 4chan and dl their images
+This script is designed to look for specific words on 4chan boards and download all images from the relevant threads.
+
+Usage of a VPN is recommended since 4chan is a shady place. Use at your own risk !
+
+. \\
+,` ( ` SquiP
+( \' "
+`-.__)_
+
 '''


 def getArgs():
     '''Gets all the arguments passed to the script and returns them in a parse_args()-type object.
     No args
@@ -27,9 +40,9 @@ def getArgs():
     parser = argparse.ArgumentParser()
     parser.add_argument("-u", "--url", help = "URL of the board. MUST INCLUDE FINAL /, ex : https://boards.4chan.org/b/", action="store", type=str, required=True)
     parser.add_argument("-f", "--folder", help = "Folder in which downloads will go, ex: ./downloads", action="store", type=str, required=True)
-    parser.add_argument("-k", "--keyword", help = "keyword or phrase to look for in the threads, ex : 'cute thread'", action="store", type=str, required=True)
+    parser.add_argument("-k", "--keyword", help = "keyword or phrase to look for in the threads, ex : 'cute thread'. Argument can be used multiple times", action='append', required=True)
+    parser.add_argument("-l", "--logfile", help="Name of the logfile. Please provide a name that is not already in use.", type=str, required=False)
     parser.add_argument("-c", "--constant", help = "Constantly download", action="store_true")
-    parser.add_argument("-t", "--threads", help = "Number of threads in case of constant run, defaults to 2", action="store", type=int, required=False)

     #Creating the args object
     args=parser.parse_args()
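The -k option now uses argparse's append action, so the flag can be passed several times and parse_args() collects every value into a list. A minimal sketch of that behaviour (the flag values below are only illustrative):

import argparse

# Sketch of the new -k behaviour: action='append' gathers repeated flags into a list.
parser = argparse.ArgumentParser()
parser.add_argument("-k", "--keyword", action="append", required=True)

args = parser.parse_args(["-k", "cute thread", "-k", "wallpaper"])
print(args.keyword)  # ['cute thread', 'wallpaper']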
@@ -41,7 +54,7 @@ def thread_finder(homepage, keyword):
     returns a list of all the threads where op said keyword on 4chan homepage
     Args:
         - homepage: bs4 soup object containing html from the homepage of the board
-        - keyword : any single word
+        - keyword : list of expressions to look for
     Returns:
         - hrefs : all the references to matching threads. They must be appended to homepage to work
     '''
@@ -52,11 +65,13 @@ def thread_finder(homepage, keyword):
     for thread in threads:
         texts = thread.findAll('blockquote', {'class' : 'postMessage'})
         for text in texts:
-            if keyword.lower() in text.text.lower() and "loli" not in text.text.lower() and "shota" not in text.text.lower():
-                links = thread.findAll('a', {'title': 'Reply to this post'})
-                for link in links:
-                    hrefs.append(f"{link['href']}")
+            for word in keyword:
+                if word.lower() in text.text.lower() and "loli" not in text.text.lower() and "shota" not in text.text.lower():
+                    print(f"Found {word}")
+                    links = thread.findAll('a', {'title': 'Reply to this post'})
+                    for link in links:
+                        hrefs.append(f"{link['href']}")
     return hrefs
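Since keyword is now a list, thread_finder() loops over every expression for each post. A self-contained sketch of the same matching logic on a hand-written snippet of board HTML (the markup is illustrative, not a real 4chan page):

from bs4 import BeautifulSoup

# Illustrative board markup: one thread with a matching post and a reply link.
html = """
<div class="thread">
  <blockquote class="postMessage">Cute thread, post your best</blockquote>
  <a title="Reply to this post" href="thread/12345#p12345">Reply</a>
</div>
"""
soup = BeautifulSoup(html, "html.parser")
keywords = ["cute thread", "wallpaper"]

hrefs = []
for thread in soup.findAll("div", {"class": "thread"}):
    for text in thread.findAll("blockquote", {"class": "postMessage"}):
        for word in keywords:
            if word.lower() in text.text.lower():
                for link in thread.findAll("a", {"title": "Reply to this post"}):
                    hrefs.append(link["href"])

print(hrefs)  # ['thread/12345#p12345']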
@@ -73,7 +88,6 @@ def html_get(url):
     browser = mechanicalsoup.Browser()
     try:
         page = browser.get(url)
-        print(f"Got html from {url}")
         return page.soup
     except Exception as e:
         print(f"Got error {e}")
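For context, html_get() wraps mechanicalsoup: Browser.get() returns a requests response with a parsed .soup attribute attached. A minimal sketch, using the same example board URL as the -u help text:

import mechanicalsoup

browser = mechanicalsoup.Browser()
page = browser.get("https://boards.4chan.org/b/")  # example board URL
print(page.soup.title)  # the parsed BeautifulSoup document is available as .soup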
@@ -105,35 +119,70 @@ def item_dl(sources, dlfolder):
     '''
     Download all items in the sources list to folder dlfolder, which we try to create"
     Args:
         - sources : a list of URLs
+        - global folder_content : see folder_watch()
     '''
-    #Making folder
-    try:
-        mkdir(dlfolder)
-    except FileExistsError:
-        print(f"{dlfolder} already exists, not creating")
+
+    global folder_content

     #Deduplicating
     imagenames = []
-    dir_content = listdir(dlfolder)

     for source in sources:
         fullsource = "http://" + source
         imagename = findall(r"[^\/]*$", source)[0]
-        if imagename[:-4] not in str(dir_content):
+        if imagename[:-4] not in folder_content:
             name = wget.download(fullsource, out=dlfolder)
-            print(f"{name} downloaded")
+            print(f"{name} downloaded from {source}")

     return True

-def constant_dl(folder, url):
+def folder_create(dlfolder):
+    '''
+    Create the folder if it does not exist
+    Args:
+        - dlfolder : path of folder to create
+    '''
+    try:
+        #Making folder
+        mkdir(dlfolder)
+    except FileExistsError:
+        print(f"{dlfolder} already exists, not creating")
+
+
+def folder_watch(folder):
+    '''
+    Watch for the content of a folder and return its content.
+    Content is a string containing all the names of all the elements.
+    Args:
+        - folder : folder to watch
+        - global folder_content : see folder_watch()
+    Returns:
+        folder_content : said string, containing all the names of all the files in the folder
+    '''
+
+    global folder_content
+
+    folder_list = listdir(folder)
+    folder_content = ""
+
+    for i in folder_list:
+        folder_content += i
+
+
+def dl_threads(folder, url, log_enabled):
     '''
     Constantly download...
     Args:
         - folder: folder to dl into
         - url : board to watch
+        - log_enabled : Set True if logging lib is used
     '''
-    while True:
+    try:
         sleep(2)
         soup = html_get(url)
         hrefs = thread_finder(soup, keyword)
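The commit replaces the per-call listdir() in item_dl() with a global folder_content string that folder_watch() refreshes; a download is skipped when the file name (minus its extension) already appears in that string. A small sketch of the check, using a temporary folder and an illustrative file name:

from os import listdir
from pathlib import Path
from tempfile import mkdtemp

dlfolder = mkdtemp()
Path(dlfolder, "1589912345678.jpg").touch()  # pretend this was downloaded earlier

# folder_watch() equivalent: concatenate every file name into one string
folder_content = ""
for name in listdir(dlfolder):
    folder_content += name

# item_dl() equivalent: strip the extension and test membership
imagename = "1589912345678.jpg"
print(imagename[:-4] in folder_content)  # True -> already there, skip the download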
@@ -144,12 +193,21 @@ def constant_dl(folder, url):

     #oneshot
     for href in hrefs:
-        print(f"going after {url}{href}")
         subsoup = html_get(f"{url}{href}")
         subsources = scraper(subsoup)
-        print(subsources)
-        item_dl(subsources, folder)
+        folder_watch(folder)
+        try:
+            item_dl(subsources, folder)
+            if log_enabled:
+                logging.info(f"Downloaded {url}{href}")
+            sleep(2)
+        except HTTPSConnectionPool as ConnErr:
+            if log_enabled:
+                logging.error(f"Got Error {ConErr}, pipes must be clogged lulz")
+            else:
+                print(f"Got Error {ConErr}, pipes must be clogged lulz")
+        except Exception as e:
+            print(f"Houston, we had a problem with {url} and {folder}: \n{e}")
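The new except clause refers to HTTPSConnectionPool, which this file does not import. As a hedged alternative sketch: mechanicalsoup is built on requests, so a dropped connection can be caught as requests.exceptions.ConnectionError (the helper name below is hypothetical, not the commit's code):

import logging
import requests
import mechanicalsoup

def get_soup_safely(url, log_enabled=False):
    # Hypothetical helper: same flow as the commit, but catching the
    # connection error class that requests actually raises.
    browser = mechanicalsoup.Browser()
    try:
        return browser.get(url).soup
    except requests.exceptions.ConnectionError as conn_err:
        if log_enabled:
            logging.error(f"Got Error {conn_err}, pipes must be clogged lulz")
        else:
            print(f"Got Error {conn_err}, pipes must be clogged lulz")
        return None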
@@ -157,15 +215,29 @@ def constant_dl(folder, url):
 args = getArgs()
 folder = args.folder
 keyword = args.keyword
-if args.threads:
-    threadnumber = args.threads
+if args.logfile:
+    logfile = args.logfile
+
+    #Creating Logfile
+    logging.basicConfig(\
+        format='%(asctime)s %(levelname)-8s %(message)s',\
+        filename=f"{path[0]}/{args.logfile}",\
+        level = logging.INFO,\
+        datefmt='%Y-%m-%d %H:%M:%S'\
+        )
+    log_enabled = True
 else:
-    threadnumber = 2
+    log_enabled = False

 url = args.url
 soup = html_get(url)
 hrefs = thread_finder(soup, keyword)
 sources = scraper(soup)
+folder_create(folder)
+folder_content = ""
+
+print("Lurking...")
 #item_dl(sources, folder)

 #Dling all threads found
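When -l is given, the script configures the root logger once at startup and later calls log through it. A sketch of the same basicConfig call, writing to a temporary directory instead of path[0] (the paths and the logged URL are illustrative):

import logging
from tempfile import gettempdir

logging.basicConfig(
    format='%(asctime)s %(levelname)-8s %(message)s',
    filename=f"{gettempdir()}/scraper.log",  # the real script uses path[0]/<logfile>
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S',
)
logging.info("Downloaded https://boards.4chan.org/b/thread/12345")
# scraper.log then holds a line like:
# 2020-05-19 18:03:12 INFO     Downloaded https://boards.4chan.org/b/thread/12345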
@@ -173,16 +245,11 @@ sources = scraper(soup)
 #oneshot
 if not args.constant:
     for href in hrefs:
-        print(f"going after {url}{href}")
-        subsoup = html_get(f"{url}{href}")
-        subsources = scraper(subsoup)
-        print(subsources)
-        item_dl(subsources, folder)
+        folder_watch(folder)
+        dl_threads(folder, url, log_enabled)
 else:
-    thread_objects = []
-    for i in range (1, threadnumber):
-        thread_objects.append(Thread(target=constant_dl, args=(folder, url)))
-    for thread in thread_objects:
-        thread.start()
+    while True:
+        folder_watch(folder)
+        dl_threads(folder, url)
+        sleep(60)
+        print('Sayonara')
@@ -46,11 +46,8 @@ Use (constant, multi-threaded):
 * -f : folder where you want to download all pictures
 * -k : keyword or keyphrase to search (better use a single word !)
 * -c : constant : enables constant downloading
-* -t 3 : number of threads. Here, 3 threads keep on running to constantly download

 ##Todo
 * Filter by filetype
-* Multi-threaded not really working, -t 2 gives one thread and many threads will cause duplicates
-* Use a try / catch when dling since some threads go 404 and it gives us a crash
 * Make a pretty website with some keywords running in the bg, making for some nice public folders (wallpapers...)
scrape.py (36 changed lines)
@@ -6,10 +6,10 @@ from bs4 import BeautifulSoup
 #Connexion web
 import mechanicalsoup
 import wget
-from os import mkdir
+from os import mkdir, listdir

 '''
 ############## 4chan thread scrapper ################
-Give me a page, i'll download all of its pictures !
+Give me a 4chan thread, i'll download all of its pictures !
 '''
@@ -57,10 +57,10 @@ def scraper(soup):
     Returns:
         - sources : A list of image sources
     '''
-    tags=soup.findAll('img')
+    tags=soup.findAll('a', {"class": "fileThumb"})
     sources = []
     for tag in tags:
-        sources.append(tag['src'])
+        sources.append(tag['href'])

     #Got image sources, removing any left http headers
     for index, item in enumerate(sources):
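The scraper now follows the <a class="fileThumb"> link, which points at the full-size image, instead of the thumbnail <img> src. A sketch on a hand-written snippet (the markup and file names are illustrative):

from bs4 import BeautifulSoup

html = """
<a class="fileThumb" href="//i.4cdn.org/b/1589912345678.jpg">
  <img src="//i.4cdn.org/b/1589912345678s.jpg">
</a>
"""
soup = BeautifulSoup(html, "html.parser")

sources = [tag["href"] for tag in soup.findAll("a", {"class": "fileThumb"})]
print(sources)  # ['//i.4cdn.org/b/1589912345678.jpg'] -- full-size image, not the thumbnail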
@@ -84,8 +84,32 @@ def item_dl(sources, dlfolder):

     for source in sources:
         fullsource = "http://" + source
-        name = wget.download(fullsource, out=dlfolder)
-        print(f"{name} downloaded")
+        if dlfolder[-1] == "/":
+            dlfolder = dlfolder[:-1]
+        if not deduplicate(fullsource, dlfolder.split("/")[-1]):
+            name = wget.download(fullsource, out=dlfolder)
+            print(f"{name} downloaded")
+        else:
+            print(f"{source} is already there")
+
+def deduplicate(url, folder):
+    '''
+    Takes a url to an image and a folder, check if said image exists in said folder
+    Args:
+        - url : a str containing a full url to an image
+        - folder : Name of a folder in /Images
+    Returns:
+        - True : The image is already in the folder
+        - False : The image is not in the folder
+    '''
+    image_name = url.split("/")[-1]
+    image_name = image_name.split("?")[0]
+    files = listdir(f"/Images/{folder}")
+
+    for i in files:
+        if i == image_name:
+            return True
+    return False
+

 args = getArgs()
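deduplicate() keys the comparison on the file name taken from the URL, with any query string stripped, and then checks it against the listing of /Images/<folder>. A one-line sketch of that extraction (the URL is illustrative):

url = "http://i.4cdn.org/b/1589912345678.jpg?ver=2"
image_name = url.split("/")[-1].split("?")[0]
print(image_name)  # 1589912345678.jpg -- compared against listdir(f"/Images/{folder}")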