Compare commits


11 Commits

Author SHA1 Message Date
ca78d58458 Update 'scrape.py'
Fixed dedupe
2021-08-12 00:54:29 +00:00
b2e7deb058 Update 'scrape.py'
Added dedup
2021-08-12 00:26:36 +00:00
e06953ccf6 Update '4chanthreadfinder.py'
Change loglevel to INFO as not be flooded by mechanicalSoup
2021-02-19 11:00:15 +00:00
77c20d67f1 Update '4chanthreadfinder.py'
Added logging
2021-02-19 10:52:22 +00:00
52fdd4f4b1 Update '4chanthreadfinder.py'
Added multiple keywords support
2021-02-10 14:26:26 +00:00
a97067d452 Update '4chanthreadfinder.py'
Bugfixes
2021-02-09 20:19:48 +00:00
aa67b222d8 Cleanup 2021-02-09 21:49:12 +01:00
5f31ab9fcc No more threads, too complex + added try on download 2021-02-08 17:39:49 +01:00
913208274b No more threads, too complex 2021-02-08 17:38:31 +01:00
85a798b311 Merge branch 'master' of https://gitea.squirrelsystem.fr/justine/ImageScrapper 2021-02-08 17:04:30 +01:00
2a5eb235e4 Added try loop on downloads 2021-02-08 17:04:12 +01:00
4 changed files with 138 additions and 50 deletions

Binary file not shown (previous version: 88 KiB).

View File

@@ -7,15 +7,28 @@ from bs4 import BeautifulSoup
import mechanicalsoup
import wget
from os import mkdir, listdir
from sys import path
from re import findall
from time import sleep
from threading import Thread
import logging
'''
############## 4chan thread scrapper ################
Here we look for particular threads on 4chan and download their images.
This script is designed to look for specific words on 4chan boards and download all images from relevant threads.
Usage of a VPN is recommended since 4chan is a shady place. Use at your own risk!
. \\
,` ( ` SquiP
( \' "
`-.__)_
'''
def getArgs():
'''Gets all the arguments passed to the script and returns them in a parse_args()-type object.
No args
@@ -27,9 +40,9 @@ def getArgs():
parser = argparse.ArgumentParser()
parser.add_argument("-u", "--url", help = "URL of the board. MUST INCLUDE FINAL /, ex : https://boards.4chan.org/b/", action="store", type=str, required=True)
parser.add_argument("-f", "--folder", help = "Folder in which downloads will go, ex: ./downloads", action="store", type=str, required=True)
parser.add_argument("-k", "--keyword", help = "keyword or phrase to look for in the threads, ex : 'cute thread'", action="store", type=str, required=True)
parser.add_argument("-k", "--keyword", help = "keyword or phrase to look for in the threads, ex : 'cute thread'. Argument can be used multiple times", action='append', required=True)
parser.add_argument("-l", "--logfile", help="Name of the logfile. Please provide a name that is not already in use.", type=str, required=False)
parser.add_argument("-c", "--constant", help = "Constantly download", action="store_true")
parser.add_argument("-t", "--threads", help = "Number of threads in case of constant run, defaults to 2", action="store", type=int, required=False)
#Creating the args object
args=parser.parse_args()
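#Editor's illustrative sketch (hypothetical, not part of this diff): action='append'
#makes argparse collect each repeated -k flag into a list, which is why thread_finder
#now loops over the keyword argument instead of treating it as a single string.
import argparse
demo_parser = argparse.ArgumentParser()
demo_parser.add_argument("-k", "--keyword", action='append', required=True)
demo_args = demo_parser.parse_args(["-k", "cute thread", "-k", "wallpaper"])
print(demo_args.keyword)  #['cute thread', 'wallpaper']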
@@ -41,7 +54,7 @@ def thread_finder(homepage, keyword):
returns a list of all the threads where op said keyword on 4chan homepage
Args:
- homepage: bs4 soup object containing html from the homepage of the board
- keyword : any single word
- keyword : list of expressions to look for
Returns:
- hrefs : all the references to matching threads. They must be appended to homepage to work
'''
@@ -52,11 +65,13 @@ def thread_finder(homepage, keyword):
for thread in threads:
texts = thread.findAll('blockquote', {'class' : 'postMessage'})
for text in texts:
if keyword.lower() in text.text.lower() and "loli" not in text.text.lower() and "shota" not in text.text.lower():
links = thread.findAll('a', {'title': 'Reply to this post'})
for link in links:
hrefs.append(f"{link['href']}")
return hrefs
for word in keyword:
if word.lower() in text.text.lower() and "loli" not in text.text.lower() and "shota" not in text.text.lower():
print(f"Found {word}")
links = thread.findAll('a', {'title': 'Reply to this post'})
for link in links:
hrefs.append(f"{link['href']}")
return hrefs
@@ -73,7 +88,6 @@ def html_get(url):
browser = mechanicalsoup.Browser()
try:
page = browser.get(url)
print(f"Got html from {url}")
return page.soup
except Exception as e:
print(f"Got error {e}")
@@ -105,35 +119,70 @@ def item_dl(sources, dlfolder):
'''
Download all items in the sources list to folder dlfolder, which we try to create
Args:
- sources : a list of URLs
'''
#Making folder
try:
mkdir(dlfolder)
except FileExistsError:
print(f"{dlfolder} already exists, not creating")
- sources : a list of URLs
- global folder_content : see folder_watch()
'''
global folder_content
#Deduplicating
imagenames = []
dir_content = listdir(dlfolder)
for source in sources:
fullsource = "http://" + source
imagename = findall(r"[^\/]*$", source)[0]
if imagename[:-4] not in str(dir_content):
if imagename[:-4] not in folder_content:
name = wget.download(fullsource, out=dlfolder)
print(f"{name} downloaded")
print(f"{name} downloaded from {source}")
return True
def constant_dl(folder, url):
def folder_create(dlfolder):
'''
Create the folder if it does not exist
Args:
- dlfolder : path of folder to create
'''
try:
#Making folder
mkdir(dlfolder)
except FileExistsError:
print(f"{dlfolder} already exists, not creating")
def folder_watch(folder):
'''
Watch a folder and rebuild its content string.
The content is a single string concatenating the names of all the elements in the folder.
Args:
- folder : folder to watch
- global folder_content : the string rebuilt here, used by item_dl() for deduplication
Returns:
None : the result is stored in the global folder_content
'''
global folder_content
folder_list = listdir(folder)
folder_content = ""
for i in folder_list:
folder_content += i
def dl_threads(folder, url, log_enabled):
'''
Constantly download...
Args:
- folder: folder to dl into
- url : board to watch
- log_enabled : Set True if logging lib is used
'''
while True:
try:
sleep(2)
soup = html_get(url)
hrefs = thread_finder(soup, keyword)
@@ -144,12 +193,21 @@ def constant_dl(folder, url):
#oneshot
for href in hrefs:
print(f"going after {url}{href}")
subsoup = html_get(f"{url}{href}")
subsources = scraper(subsoup)
print(subsources)
item_dl(subsources, folder)
folder_watch(folder)
try:
item_dl(subsources, folder)
if log_enabled:
logging.info(f"Downloaded {url}{href}")
sleep(2)
except ConnectionError as ConnErr:
if log_enabled:
logging.error(f"Got Error {ConnErr}, pipes must be clogged lulz")
else:
print(f"Got Error {ConnErr}, pipes must be clogged lulz")
except Exception as e:
print(f"Houston, we had a problem with {url} and {folder}: \n{e}")
@@ -157,15 +215,29 @@ def constant_dl(folder, url):
args = getArgs()
folder = args.folder
keyword = args.keyword
if args.threads:
threadnumber = args.threads
if args.logfile:
logfile = args.logfile
#Creating Logfile
logging.basicConfig(\
format='%(asctime)s %(levelname)-8s %(message)s',\
filename=f"{path[0]}/{args.logfile}",\
level = logging.INFO,\
datefmt='%Y-%m-%d %H:%M:%S'\
)
log_enabled = True
else:
threadnumber = 2
log_enabled = False
url = args.url
soup = html_get(url)
hrefs = thread_finder(soup, keyword)
sources = scraper(soup)
folder_create(folder)
folder_content = ""
print("Lurking...")
#item_dl(sources, folder)
#Dling all threads found
@@ -173,16 +245,11 @@ sources = scraper(soup)
#oneshot
if not args.constant:
for href in hrefs:
print(f"going after {url}{href}")
subsoup = html_get(f"{url}{href}")
subsources = scraper(subsoup)
print(subsources)
item_dl(subsources, folder)
folder_watch(folder)
dl_threads(folder, url, log_enabled)
else:
thread_objects = []
for i in range (1, threadnumber):
thread_objects.append(Thread(target=constant_dl, args=(folder, url)))
for thread in thread_objects:
thread.start()
while True:
folder_watch(folder)
dl_threads(folder, url, log_enabled)
sleep(60)
print('Sayonara')
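#Editor's illustrative sketch (hypothetical filenames, not part of this diff): the dedupe
#added in item_dl() and folder_watch() above boils down to a substring test. folder_watch()
#concatenates every filename in the download folder into one string, and item_dl() skips a
#source whose extension-stripped name already appears in that string.
from re import findall
folder_content = "1612787370005.jpg1612787412345.png"  #as built by folder_watch()
source = "i.4cdn.org/b/1612787412345.png"              #as produced by scraper()
imagename = findall(r"[^\/]*$", source)[0]             #"1612787412345.png"
print(imagename[:-4] in folder_content)                #True, so this image is skipped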

View File

@@ -46,11 +46,8 @@ Use (constant, multi-threaded):
* -f : folder where you want to download all pictures
* -k : keyword or keyphrase to search (better use a single word!)
* -c : constant : enables constant downloading
* -t 3 : number of threads. Here, 3 threads keep on running to constantly download (full example invocation below)
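A hypothetical full invocation combining these flags (board URL, folder and keywords are only examples) could look like `python3 4chanthreadfinder.py -u https://boards.4chan.org/b/ -f ./downloads -k 'cute thread' -k 'wallpaper' -c -t 3`.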
## Todo
* Filter by filetype
* Multi-threading not really working: -t 2 gives one thread, and many threads will cause duplicates
* Use a try / catch when downloading, since some threads go 404 and crash the script
* Make a pretty website with some keywords running in the bg, making for some nice public folders (wallpapers...)

View File

@@ -6,10 +6,10 @@ from bs4 import BeautifulSoup
#Web connection
import mechanicalsoup
import wget
from os import mkdir
from os import mkdir, listdir
'''
############## 4chan thread scrapper ################
Give me a page, I'll download all of its pictures!
Give me a 4chan thread, I'll download all of its pictures!
'''
@@ -57,10 +57,10 @@ def scraper(soup):
Returns:
- sources : A list of image sources
'''
tags=soup.findAll('img')
tags=soup.findAll('a', {"class": "fileThumb"})
sources = []
for tag in tags:
sources.append(tag['src'])
sources.append(tag['href'])
#Got image sources, removing any left http headers
for index, item in enumerate(sources):
@@ -84,8 +84,32 @@ def item_dl(sources, dlfolder):
for source in sources:
fullsource = "http://" + source
name = wget.download(fullsource, out=dlfolder)
print(f"{name} downloaded")
if dlfolder[-1] == "/":
dlfolder = dlfolder[:-1]
if not deduplicate(fullsource, dlfolder.split("/")[-1]):
name = wget.download(fullsource, out=dlfolder)
print(f"{name} downloaded")
else:
print(f"{source} is already there")
def deduplicate(url, folder):
'''
Takes a URL to an image and a folder, and checks whether said image exists in said folder
Args:
- url : a str containing a full url to an image
- folder : Name of a folder in /Images
Returns:
- True : The image is already in the folder
- False : The image is not in the folder
'''
image_name = url.split("/")[-1]
image_name = image_name.split("?")[0]
files = listdir(f"/Images/{folder}")
for i in files:
if i == image_name:
return True
return False
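#Editor's usage note (hypothetical values, not part of this diff): a call such as
#deduplicate("http://i.4cdn.org/b/1612787412345.jpg?s=a", "b") strips the query string,
#keeps "1612787412345.jpg", lists /Images/b, and returns True only if that file is
#already present there.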
args = getArgs()