Compare commits

...

7 Commits

Author SHA1 Message Date
ca78d58458 Update 'scrape.py'
Fixed dedupe
2021-08-12 00:54:29 +00:00
b2e7deb058 Update 'scrape.py'
Added dedup
2021-08-12 00:26:36 +00:00
e06953ccf6 Update '4chanthreadfinder.py'
Change loglevel to INFO so as not to be flooded by mechanicalSoup
2021-02-19 11:00:15 +00:00
77c20d67f1 Update '4chanthreadfinder.py'
Added logging
2021-02-19 10:52:22 +00:00
52fdd4f4b1 Update '4chanthreadfinder.py'
Added multiple keywords support
2021-02-10 14:26:26 +00:00
a97067d452 Update '4chanthreadfinder.py'
Bugfixes
2021-02-09 20:19:48 +00:00
aa67b222d8 menage
2021-02-09 21:49:12 +01:00
14 changed files with 91 additions and 28 deletions

12 binary files (deleted images) not shown. Former sizes: 88 KiB, 248 KiB, 272 KiB, 408 KiB, 344 KiB, 376 KiB, 408 KiB, 920 KiB, 24 KiB, 24 KiB, 24 KiB, 56 KiB.

View File: 4chanthreadfinder.py

@@ -7,9 +7,11 @@ from bs4 import BeautifulSoup
 import mechanicalsoup
 import wget
 from os import mkdir, listdir
+from sys import path
 from re import findall
 from time import sleep
 from threading import Thread
+import logging
 
 '''
 ############## 4chan thread scrapper ################
@@ -38,7 +40,8 @@ def getArgs():
     parser = argparse.ArgumentParser()
     parser.add_argument("-u", "--url", help = "URL of the board. MUST INCLUDE FINAL /, ex : https://boards.4chan.org/b/", action="store", type=str, required=True)
     parser.add_argument("-f", "--folder", help = "Folder in which downloads will go, ex: ./downloads", action="store", type=str, required=True)
-    parser.add_argument("-k", "--keyword", help = "keyword or phrase to look for in the threads, ex : 'cute thread'", action="store", type=str, required=True)
+    parser.add_argument("-k", "--keyword", help = "keyword or phrase to look for in the threads, ex : 'cute thread'. Argument can be used multiple times", action='append', required=True)
+    parser.add_argument("-l", "--logfile", help="Name of the logfile. Please provide a name that is not already in use.", type=str, required=False)
     parser.add_argument("-c", "--constant", help = "Constantly download", action="store_true")
 
     #Creating the args object
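The switch from action="store" to action='append' is what enables repeated -k flags. A minimal standalone sketch of that behaviour (hypothetical keywords; the flag name is taken from the diff above):

import argparse

parser = argparse.ArgumentParser()
# Every occurrence of -k appends to one list instead of overwriting
parser.add_argument("-k", "--keyword", action='append', required=True)

args = parser.parse_args(["-k", "cute thread", "-k", "cat thread"])
print(args.keyword)  # ['cute thread', 'cat thread']

In other words, an invocation such as python3 4chanthreadfinder.py -u https://boards.4chan.org/b/ -f ./downloads -k 'cute thread' -k 'cat thread' now hands thread_finder a list of phrases rather than a single string.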
@@ -51,7 +54,7 @@ def thread_finder(homepage, keyword):
     returns a list of all the threads where op said keyword on 4chan homepage
     Args:
         - homepage: bs4 soup object containing html from the homepage of the board
-        - keyword : any single word
+        - keyword : list of expressions to look for
     Returns:
         - hrefs : all the references to matching threads. They must be appended to homepage to work
     '''
@@ -62,11 +65,13 @@ def thread_finder(homepage, keyword):
     for thread in threads:
         texts = thread.findAll('blockquote', {'class' : 'postMessage'})
         for text in texts:
-            if keyword.lower() in text.text.lower() and "loli" not in text.text.lower() and "shota" not in text.text.lower():
-                links = thread.findAll('a', {'title': 'Reply to this post'})
-                for link in links:
-                    hrefs.append(f"{link['href']}")
-    return hrefs
+            for word in keyword:
+                if word.lower() in text.text.lower() and "loli" not in text.text.lower() and "shota" not in text.text.lower():
+                    print(f"Found {word}")
+                    links = thread.findAll('a', {'title': 'Reply to this post'})
+                    for link in links:
+                        hrefs.append(f"{link['href']}")
+    return hrefs
@@ -83,7 +88,6 @@ def html_get(url):
     browser = mechanicalsoup.Browser()
     try:
         page = browser.get(url)
-        print(f"Got html from {url}")
         return page.soup
     except Exception as e:
         print(f"Got error {e}")
@@ -122,12 +126,6 @@ def item_dl(sources, dlfolder):
     global folder_content
 
-    try:
-        #Making folder
-        mkdir(dlfolder)
-    except FileExistsError:
-        print(f"{dlfolder} already exists, not creating")
-
     #Deduplicating
     imagenames = []
@@ -136,10 +134,25 @@ def item_dl(sources, dlfolder):
         imagename = findall(r"[^\/]*$", source)[0]
         if imagename[:-4] not in folder_content:
             name = wget.download(fullsource, out=dlfolder)
-            print(f"{name} downloaded")
+            print(f"{name} downloaded from {source}")
 
     return True
 
+def folder_create(dlfolder):
+    '''
+    Create the folder if it does not exist
+    Args:
+        - dlfolder : path of folder to create
+    '''
+    try:
+        #Making folder
+        mkdir(dlfolder)
+    except FileExistsError:
+        print(f"{dlfolder} already exists, not creating")
+
 def folder_watch(folder):
     '''
     Watch for the content of a folder and return its content.
@@ -160,12 +173,13 @@ def folder_watch(folder):
         folder_content += i
 
-def dl_threads(folder, url):
+def dl_threads(folder, url, log_enabled):
     '''
     Constantly download...
     Args:
         - folder: folder to dl into
         - url : board to watch
+        - log_enabled : Set True if logging lib is used
     '''
 
     try:
@@ -179,13 +193,21 @@ def dl_threads(folder, url):
             #oneshot
             for href in hrefs:
-                print(f"going after {url}{href}")
                 subsoup = html_get(f"{url}{href}")
                 subsources = scraper(subsoup)
                 folder_watch(folder)
-                item_dl(subsources, folder)
+                try:
+                    item_dl(subsources, folder)
+                    if log_enabled:
+                        logging.info(f"Downloaded {url}{href}")
+                    sleep(2)
+                except HTTPSConnectionPool as ConnErr:
+                    if log_enabled:
+                        logging.error(f"Got Error {ConErr}, pipes must be clogged lulz")
+                    else:
+                        print(f"Got Error {ConErr}, pipes must be clogged lulz")
 
     except Exception as e:
-        print(f"Houston, we had a problem: \n{e}")
+        print(f"Houston, we had a problem with {url} and {folder}: \n{e}")
@@ -193,12 +215,29 @@
 args = getArgs()
 folder = args.folder
 keyword = args.keyword
+
+if args.logfile:
+    logfile = args.logfile
+    #Creating Logfile
+    logging.basicConfig(\
+            format='%(asctime)s %(levelname)-8s %(message)s',\
+            filename=f"{path[0]}/{args.logfile}",\
+            level = logging.INFO,\
+            datefmt='%Y-%m-%d %H:%M:%S'\
+            )
+    log_enabled = True
+else:
+    log_enabled = False
+
 url = args.url
 soup = html_get(url)
 hrefs = thread_finder(soup, keyword)
 sources = scraper(soup)
+folder_create(folder)
 folder_content = ""
+print("Lurking...")
 
 #item_dl(sources, folder)
 #Dling all threads found
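For reference, sys.path[0] (imported above as path) is the directory containing the invoked script, so the logfile lands next to 4chanthreadfinder.py regardless of the working directory. The same pattern in isolation, with a hypothetical logfile name:

import logging
from sys import path

# Log lines come out like: 2021-02-19 11:00:15 INFO     Lurking...
logging.basicConfig(
    format='%(asctime)s %(levelname)-8s %(message)s',
    filename=f"{path[0]}/scraper.log",  # scraper.log is a placeholder name
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S',
)
logging.info("Lurking...")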
@@ -207,10 +246,10 @@ folder_content = ""
 if not args.constant:
     for href in hrefs:
         folder_watch(folder)
-        dl_threads(folder_url)
+        dl_threads(folder, url, log_enabled)
 else:
     while True:
         folder_watch(folder)
         dl_threads(folder, url)
+        sleep(60)
+
+print('Sayonara')

View File: scrape.py

@@ -6,10 +6,10 @@ from bs4 import BeautifulSoup
 #Connexion web
 import mechanicalsoup
 import wget
-from os import mkdir
+from os import mkdir, listdir
 
 '''
 ############## 4chan thread scrapper ################
-Give me a page, i'll download all of its pictures !
+Give me a 4chan thread, i'll download all of its pictures !
 '''
@@ -57,10 +57,10 @@ def scraper(soup):
     Returns:
         - sources : A list of image sources
     '''
-    tags=soup.findAll('img')
+    tags=soup.findAll('a', {"class": "fileThumb"})
     sources = []
     for tag in tags:
-        sources.append(tag['src'])
+        sources.append(tag['href'])
 
     #Got image sources, removing any left http headers
     for index, item in enumerate(sources):
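Context for the selector change, assuming 4chan's usual thread markup: each image post wraps a thumbnail img inside an a.fileThumb anchor whose href points at the full-size file, so reading the anchor's href instead of the img's src upgrades downloads from thumbnails to originals. A sketch against a hypothetical post fragment:

from bs4 import BeautifulSoup

# Hypothetical 4chan post fragment, for illustration only
html = '<a class="fileThumb" href="//i.4cdn.org/b/1612345.jpg"><img src="//i.4cdn.org/b/1612345s.jpg"></a>'
soup = BeautifulSoup(html, "html.parser")

print(soup.find('img')['src'])                         # //i.4cdn.org/b/1612345s.jpg  (thumbnail)
print(soup.find('a', {"class": "fileThumb"})['href'])  # //i.4cdn.org/b/1612345.jpg   (full size)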
@@ -84,8 +84,32 @@ def item_dl(sources, dlfolder):
     for source in sources:
         fullsource = "http://" + source
-        name = wget.download(fullsource, out=dlfolder)
-        print(f"{name} downloaded")
+        if dlfolder[-1] == "/":
+            dlfolder = dlfolder[:-1]
+        if not deduplicate(fullsource, dlfolder.split("/")[-1]):
+            name = wget.download(fullsource, out=dlfolder)
+            print(f"{name} downloaded")
+        else:
+            print(f"{source} is already there")
+
+def deduplicate(url, folder):
+    '''
+    Takes a url to an image and a folder, check if said image exists in said folder
+    Args:
+        - url : a str containing a full url to an image
+        - folder : Name of a folder in /Images
+    Returns:
+        - True : The image is already in the folder
+        - False : The image is not in the folder
+    '''
+    image_name = url.split("/")[-1]
+    image_name = image_name.split("?")[0]
+    files = listdir(f"/Images/{folder}")
+    for i in files:
+        if i == image_name:
+            return True
+    return False
+
 args = getArgs()
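A usage sketch for the new deduplicate helper. Note that the /Images root is hard-coded, so the check assumes downloads live under /Images/<board>; the URL and folder below are hypothetical:

# Has 1612345.jpg already been saved to /Images/b ?  (illustration only)
already_there = deduplicate("http://i.4cdn.org/b/1612345.jpg?s=rand", "b")

# Inside the helper: image_name becomes "1612345.jpg" (query string stripped),
# then it is compared against every entry of listdir("/Images/b").
if not already_there:
    print("safe to download")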