Compare commits

...

7 Commits

Author SHA1 Message Date
ca78d58458 Update 'scrape.py'
Fixed dedupe
2021-08-12 00:54:29 +00:00
b2e7deb058 Update 'scrape.py'
Added dedup
2021-08-12 00:26:36 +00:00
e06953ccf6 Update '4chanthreadfinder.py'
Change loglevel to INFO so as not to be flooded by mechanicalSoup
2021-02-19 11:00:15 +00:00
77c20d67f1 Update '4chanthreadfinder.py'
Added logging
2021-02-19 10:52:22 +00:00
52fdd4f4b1 Update '4chanthreadfinder.py'
Added multiple keywords support
2021-02-10 14:26:26 +00:00
a97067d452 Update '4chanthreadfinder.py'
Bugfixes
2021-02-09 20:19:48 +00:00
aa67b222d8 Cleanup 2021-02-09 21:49:12 +01:00
14 changed files with 91 additions and 28 deletions

12 binary files (images) removed, not shown. Former sizes: 88 KiB, 248 KiB, 272 KiB, 408 KiB, 344 KiB, 376 KiB, 408 KiB, 920 KiB, 24 KiB, 24 KiB, 24 KiB, 56 KiB.

4chanthreadfinder.py

@@ -7,9 +7,11 @@ from bs4 import BeautifulSoup
 import mechanicalsoup
 import wget
 from os import mkdir, listdir
+from sys import path
 from re import findall
 from time import sleep
 from threading import Thread
+import logging
 '''
 ############## 4chan thread scrapper ################
@@ -38,7 +40,8 @@ def getArgs():
     parser = argparse.ArgumentParser()
     parser.add_argument("-u", "--url", help = "URL of the board. MUST INCLUDE FINAL /, ex : https://boards.4chan.org/b/", action="store", type=str, required=True)
     parser.add_argument("-f", "--folder", help = "Folder in which downloads will go, ex: ./downloads", action="store", type=str, required=True)
-    parser.add_argument("-k", "--keyword", help = "keyword or phrase to look for in the threads, ex : 'cute thread'", action="store", type=str, required=True)
+    parser.add_argument("-k", "--keyword", help = "keyword or phrase to look for in the threads, ex : 'cute thread'. Argument can be used multiple times", action='append', required=True)
+    parser.add_argument("-l", "--logfile", help="Name of the logfile. Please provide a name that is not already in use.", type=str, required=False)
     parser.add_argument("-c", "--constant", help = "Constantly download", action="store_true")
     #Creating the args object
@@ -51,7 +54,7 @@ def thread_finder(homepage, keyword):
     returns a list of all the threads where op said keyword on 4chan homepage
     Args:
         - homepage: bs4 soup object containing html from the homepage of the board
-        - keyword : any single word
+        - keyword : list of expressions to look for
     Returns:
         - hrefs : all the references to matching threads. They must be appended to homepage to work
     '''
@@ -62,11 +65,13 @@ def thread_finder(homepage, keyword):
     for thread in threads:
         texts = thread.findAll('blockquote', {'class' : 'postMessage'})
         for text in texts:
-            if keyword.lower() in text.text.lower() and "loli" not in text.text.lower() and "shota" not in text.text.lower():
-                links = thread.findAll('a', {'title': 'Reply to this post'})
-                for link in links:
-                    hrefs.append(f"{link['href']}")
-    return hrefs
+            for word in keyword:
+                if word.lower() in text.text.lower() and "loli" not in text.text.lower() and "shota" not in text.text.lower():
+                    print(f"Found {word}")
+                    links = thread.findAll('a', {'title': 'Reply to this post'})
+                    for link in links:
+                        hrefs.append(f"{link['href']}")
+    return hrefs
@@ -83,7 +88,6 @@ def html_get(url):
     browser = mechanicalsoup.Browser()
     try:
         page = browser.get(url)
-        print(f"Got html from {url}")
         return page.soup
     except Exception as e:
         print(f"Got error {e}")
@@ -122,12 +126,6 @@ def item_dl(sources, dlfolder):
     global folder_content
-    try:
-        #Making folder
-        mkdir(dlfolder)
-    except FileExistsError:
-        print(f"{dlfolder} already exists, not creating")
     #Deduplicating
     imagenames = []
@@ -136,10 +134,25 @@ def item_dl(sources, dlfolder):
         imagename = findall(r"[^\/]*$", source)[0]
         if imagename[:-4] not in folder_content:
             name = wget.download(fullsource, out=dlfolder)
-            print(f"{name} downloaded")
+            print(f"{name} downloaded from {source}")
     return True

+def folder_create(dlfolder):
+    '''
+    Create the folder if it does not exist
+    Args:
+        - dlfolder : path of folder to create
+    '''
+    try:
+        #Making folder
+        mkdir(dlfolder)
+    except FileExistsError:
+        print(f"{dlfolder} already exists, not creating")

 def folder_watch(folder):
     '''
     Watch for the content of a folder and return its content.
@@ -160,12 +173,13 @@ def folder_watch(folder):
         folder_content += i

-def dl_threads(folder, url):
+def dl_threads(folder, url, log_enabled):
     '''
     Constantly download...
     Args:
         - folder: folder to dl into
        - url : board to watch
+        - log_enabled : Set True if logging lib is used
     '''
     try:
@@ -179,13 +193,21 @@ def dl_threads(folder, url):
         #oneshot
         for href in hrefs:
             print(f"going after {url}{href}")
             subsoup = html_get(f"{url}{href}")
             subsources = scraper(subsoup)
             folder_watch(folder)
-            item_dl(subsources, folder)
+            try:
+                item_dl(subsources, folder)
+                if log_enabled:
+                    logging.info(f"Downloaded {url}{href}")
+                sleep(2)
+            except ConnectionError as conn_err:
+                if log_enabled:
+                    logging.error(f"Got Error {conn_err}, pipes must be clogged lulz")
+                else:
+                    print(f"Got Error {conn_err}, pipes must be clogged lulz")
     except Exception as e:
-        print(f"Houston, we had a problem: \n{e}")
+        print(f"Houston, we had a problem with {url} and {folder}: \n{e}")
@@ -193,12 +215,29 @@ def dl_threads(folder, url):
 args = getArgs()
 folder = args.folder
 keyword = args.keyword
+if args.logfile:
+    logfile = args.logfile
+    #Creating Logfile
+    logging.basicConfig(
+        format='%(asctime)s %(levelname)-8s %(message)s',
+        filename=f"{path[0]}/{args.logfile}",
+        level = logging.INFO,
+        datefmt='%Y-%m-%d %H:%M:%S'
+    )
+    log_enabled = True
+else:
+    log_enabled = False
 url = args.url
 soup = html_get(url)
 hrefs = thread_finder(soup, keyword)
 sources = scraper(soup)
+folder_create(folder)
 folder_content = ""
 print("Lurking...")
 #item_dl(sources, folder)

 #Dling all threads found
@@ -207,10 +246,10 @@ folder_content = ""
 if not args.constant:
     for href in hrefs:
         folder_watch(folder)
-        dl_threads(folder_url)
+        dl_threads(folder, url, log_enabled)
 else:
     while True:
         folder_watch(folder)
         dl_threads(folder, url, log_enabled)
         sleep(60)
 print('Sayonara')
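
For readers skimming the diff, the two behavioural changes in this file are argparse's `action='append'` (each repeated `-k` flag accumulates into one list) and the optional `--logfile` gate around `logging.basicConfig`. Below is a minimal standalone sketch of both patterns, with made-up argument values and a made-up log filename; it mirrors the diff but is not the project's code:

```python
# Sketch only: demonstrates action='append' and an optional logfile gate.
import argparse
import logging

parser = argparse.ArgumentParser()
# action='append' collects every -k occurrence into a list,
# e.g. -k 'cute thread' -k cats  ->  ['cute thread', 'cats']
parser.add_argument("-k", "--keyword", action="append", required=True)
parser.add_argument("-l", "--logfile", type=str, required=False)
# Hypothetical values standing in for real CLI input:
args = parser.parse_args(["-k", "cute thread", "-k", "cats", "-l", "run.log"])

if args.logfile:
    # One-time root-logger setup; later logging.info()/logging.error()
    # calls anywhere in the program write to this file.
    logging.basicConfig(
        format="%(asctime)s %(levelname)-8s %(message)s",
        filename=args.logfile,
        level=logging.INFO,
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    log_enabled = True
else:
    log_enabled = False

# The matching loop then tests every keyword against a post's text.
text = "A very cute thread full of cats"
for word in args.keyword:
    if word.lower() in text.lower():
        print(f"Found {word}")
```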

scrape.py

@@ -6,10 +6,10 @@ from bs4 import BeautifulSoup
 #Web connection
 import mechanicalsoup
 import wget
-from os import mkdir
+from os import mkdir, listdir
 '''
 ############## 4chan thread scrapper ################
-Give me a page, i'll download all of its pictures !
+Give me a 4chan thread, i'll download all of its pictures !
 '''
@@ -57,10 +57,10 @@ def scraper(soup):
     Returns:
         - sources : A list of image sources
     '''
-    tags=soup.findAll('img')
+    tags=soup.findAll('a', {"class": "fileThumb"})
     sources = []
     for tag in tags:
-        sources.append(tag['src'])
+        sources.append(tag['href'])
     #Got image sources, removing any left http headers
     for index, item in enumerate(sources):
@@ -84,8 +84,32 @@ def item_dl(sources, dlfolder):
     for source in sources:
         fullsource = "http://" + source
-        name = wget.download(fullsource, out=dlfolder)
-        print(f"{name} downloaded")
+        if dlfolder[-1] == "/":
+            dlfolder = dlfolder[:-1]
+        if not deduplicate(fullsource, dlfolder.split("/")[-1]):
+            name = wget.download(fullsource, out=dlfolder)
+            print(f"{name} downloaded")
+        else:
+            print(f"{source} is already there")

+def deduplicate(url, folder):
+    '''
+    Takes a url to an image and a folder, check if said image exists in said folder
+    Args:
+        - url : a str containing a full url to an image
+        - folder : Name of a folder in /Images
+    Returns:
+        - True : The image is already in the folder
+        - False : The image is not in the folder
+    '''
+    image_name = url.split("/")[-1]
+    image_name = image_name.split("?")[0]
+    files = listdir(f"/Images/{folder}")
+    for i in files:
+        if i == image_name:
+            return True
+    return False

 args = getArgs()
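
The heart of the new `deduplicate` helper is the URL-to-filename normalization: keep everything after the last `/`, then drop any `?query` suffix, and compare against what is already on disk under the hard-coded `/Images` root. A filesystem-free sketch of the same idea, using an in-memory set and an invented example URL in place of `listdir(f"/Images/{folder}")`:

```python
# Sketch of the dedup check above, with an in-memory set standing in
# for the directory listing; not the repo's code.
def already_downloaded(url: str, existing: set) -> bool:
    # Mirrors url.split("/")[-1] then split("?")[0] from the diff:
    # isolate the filename and strip any query string.
    image_name = url.split("/")[-1].split("?")[0]
    return image_name in existing

# Hypothetical filenames and URLs for illustration:
existing = {"1612885563921.jpg"}
print(already_downloaded("http://i.example.org/b/1612885563921.jpg?x=1", existing))  # True
print(already_downloaded("http://i.example.org/b/999.png", existing))                # False
```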