Compare commits

...

4 Commits

Author SHA1 Message Date
Justine 5f31ab9fcc No more threads, too complex + added try on download 2021-02-08 17:39:49 +01:00
Justine 913208274b No more threads, too complex 2021-02-08 17:38:31 +01:00
Justine 85a798b311 Merge branch 'master' of https://gitea.squirrelsystem.fr/justine/ImageScrapper 2021-02-08 17:04:30 +01:00
Justine 2a5eb235e4 Added a try loop on the downloads 2021-02-08 17:04:12 +01:00
13 changed files with 53 additions and 28 deletions
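Taken together, these commits drop the multi-threaded constant downloader in favour of a single sequential loop and wrap each download pass in a try/except so a vanished (404) thread no longer crashes the run. A minimal sketch of that pattern, using hypothetical stand-ins fetch_image_urls() and download_all() rather than the script's real helpers:

    from time import sleep

    def fetch_image_urls(board_url):
        # placeholder: would scrape the board and return image URLs
        return []

    def download_all(urls, folder):
        # placeholder: would download every URL into folder
        print(f"would download {len(urls)} files into {folder}")

    def constant_run(board_url, folder):
        # one pass every 2 seconds; errors are logged instead of killing the loop
        while True:
            try:
                sleep(2)
                download_all(fetch_image_urls(board_url), folder)
            except Exception as e:
                print(f"problem during download pass:\n{e}")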

11 binary image files added (previews not shown). Sizes: 248 KiB, 272 KiB, 408 KiB, 344 KiB, 376 KiB, 408 KiB, 920 KiB, 24 KiB, 24 KiB, 24 KiB, 56 KiB.

View File

@@ -10,12 +10,23 @@ from os import mkdir, listdir
 from re import findall
 from time import sleep
 from threading import Thread
 '''
 ############## 4chan thread scrapper ################
-here we look for particular threads on 4chan and dl their images
+This script is designed to look for specific words on 4chan boards, and download all images from the relevant threads.
+Usage of a VPN is recommended since 4chan is a shady place. Use at your own risk !
+. \\
+,` ( ` SquiP
+( \' "
+`-.__)_
 '''
 def getArgs():
     '''Gets all the arguments passed to the script and returns them in a parse_args()-type object.
     No args
@@ -29,7 +40,6 @@ def getArgs():
     parser.add_argument("-f", "--folder", help = "Folder in which downloads will go, ex: ./downloads", action="store", type=str, required=True)
     parser.add_argument("-k", "--keyword", help = "keyword or phrase to look for in the threads, ex : 'cute thread'", action="store", type=str, required=True)
     parser.add_argument("-c", "--constant", help = "Constantly download", action="store_true")
-    parser.add_argument("-t", "--threads", help = "Number of threads in case of constant run, defaults to 2", action="store", type=int, required=False)
     #Creating the args object
     args=parser.parse_args()
@@ -105,35 +115,60 @@ def item_dl(sources, dlfolder):
     '''
     Download all items in the sources list to folder dlfolder, which we try to create"
     Args:
-        - sources : a list of URLs
+        - sources : a list of URLsi
+        - global folder_content : see folder_watch()
     '''
-    #Making folder
+    global folder_content
     try:
+        #Making folder
         mkdir(dlfolder)
     except FileExistsError:
         print(f"{dlfolder} already exists, not creating")
     #Deduplicating
     imagenames = []
-    dir_content = listdir(dlfolder)
     for source in sources:
         fullsource = "http://" + source
         imagename = findall(r"[^\/]*$", source)[0]
-        if imagename[:-4] not in str(dir_content):
+        if imagename[:-4] not in folder_content:
             name = wget.download(fullsource, out=dlfolder)
             print(f"{name} downloaded")
     return True
-def constant_dl(folder, url):
+def folder_watch(folder):
+    '''
+    Watch for the content of a folder and return its content.
+    Content is a string containing all the names of all the elements.
+    Args:
+        - folder : folder to watch
+        - global folder_content : see folder_watch()
+    Returns:
+        folder_content : said string, containing all the names of all the files in the folder
+    '''
+    global folder_content
+    folder_list = listdir(folder)
+    folder_content = ""
+    for i in folder_list:
+        folder_content += i
+def dl_threads(folder, url):
     '''
     Constantly download...
     Args:
         - folder: folder to dl into
         - url : board to watch
     '''
-    while True:
+    try:
         sleep(2)
         soup = html_get(url)
         hrefs = thread_finder(soup, keyword)
@@ -147,9 +182,10 @@ def constant_dl(folder, url):
             print(f"going after {url}{href}")
             subsoup = html_get(f"{url}{href}")
             subsources = scraper(subsoup)
-            print(subsources)
+            folder_watch(folder)
             item_dl(subsources, folder)
+    except Exception as e:
+        print(f"Houston, we had a problem: \n{e}")
@@ -157,15 +193,12 @@ def constant_dl(folder, url):
 args = getArgs()
 folder = args.folder
 keyword = args.keyword
-if args.threads:
-    threadnumber = args.threads
-else:
-    threadnumber = 2
 url = args.url
 soup = html_get(url)
 hrefs = thread_finder(soup, keyword)
 sources = scraper(soup)
+folder_content = ""
 #item_dl(sources, folder)
 #Dling all threads found
@@ -173,16 +206,11 @@ sources = scraper(soup)
 #oneshot
 if not args.constant:
     for href in hrefs:
-        print(f"going after {url}{href}")
-        subsoup = html_get(f"{url}{href}")
-        subsources = scraper(subsoup)
-        print(subsources)
-        item_dl(subsources, folder)
+        folder_watch(folder)
+        dl_threads(folder_url)
 else:
-    thread_objects = []
-    for i in range (1, threadnumber):
-        thread_objects.append(Thread(target=constant_dl, args=(folder, url)))
-    for thread in thread_objects:
-        thread.start()
+    while True:
+        folder_watch(folder)
+        dl_threads(folder, url)
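The new folder_watch() / item_dl() pair deduplicates by concatenating every filename already in the download folder into one global string (folder_content) and skipping any image whose basename, minus its extension, appears in that string as a substring. A standalone sketch of just that check, with hypothetical names and without the script's global state:

    from os import listdir
    from re import findall

    def build_folder_content(folder):
        # same idea as folder_watch(): one big string of existing filenames
        return "".join(listdir(folder))

    def should_download(source_url, folder_content):
        # take the basename (e.g. "abc123.jpg"), drop the 4-character extension,
        # and skip it if it already appears in the folder listing
        imagename = findall(r"[^\/]*$", source_url)[0]
        return imagename[:-4] not in folder_content

Because this is a plain substring test, a filename that happens to contain another image's basename would also be treated as already downloaded.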

View File

@@ -46,11 +46,8 @@ Use (constant, multi-threaded):
 * -f : folder where you want to download all pictures
 * -k : keyword or keyphrase to search (better use a single word !)
 * -c : constant : enables constant downloading
-* -t 3 : number of threads. Here, 3 threads keep on running to constantly download
 ##Todo
 * Filter by filetype
-* Multi-threaded not really working, -t 2 gives one thread and many threads will cause duplicates
-* Use a try / catch when dling since some threads go 404 and it gives us a crash
 * Make a pretty website with some keywords running in the bg, making for some nice public folders (wallpapers...)
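The flags above match the argparse options visible in the script diff (-f, -k, -c). As a purely illustrative invocation, assuming the entry point is named imagescrapper.py and that the board URL is passed with a -u/--url flag (implied by url = args.url in the script, not shown in this hunk):

    python3 imagescrapper.py -u https://boards.4chan.org/w/ -f ./downloads -k "wallpaper" -c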