Compare commits


4 Commits

Author   SHA1         Message                                                Date
Justine  5f31ab9fcc   No more threads, too complex + added try on download   2021-02-08 17:39:49 +01:00
Justine  913208274b   No more threads, too complex                           2021-02-08 17:38:31 +01:00
Justine  85a798b311   Merge branch 'master' of https://gitea.squirrelsystem.fr/justine/ImageScrapper   2021-02-08 17:04:30 +01:00
Justine  2a5eb235e4   Added try loop on the downloads                        2021-02-08 17:04:12 +01:00
13 changed files with 53 additions and 28 deletions

11 binary files (images) added, not shown. Sizes: 248 KiB, 272 KiB, 408 KiB, 344 KiB, 376 KiB, 408 KiB, 920 KiB, 24 KiB, 24 KiB, 24 KiB, 56 KiB.
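The four commits above make two changes to the scraper: the threaded constant-download mode is dropped as too complex, and downloads are wrapped in a try block so that a thread which goes 404 mid-scrape no longer crashes the whole run. A minimal sketch of that try-on-download pattern, using the same third-party wget package as the script (function and variable names here are illustrative, not the repo's exact code):

    import wget  # third-party "wget" package, already used by the script

    def safe_download(urls, folder):
        # Try each download in turn; log and skip failures (e.g. a 404'd
        # thread) instead of letting one dead link abort the whole batch.
        for url in urls:
            try:
                name = wget.download(url, out=folder)
                print(f"{name} downloaded")
            except Exception as e:
                print(f"skipping {url}: {e}")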

View File

@@ -10,12 +10,23 @@ from os import mkdir, listdir
 from re import findall
 from time import sleep
-from threading import Thread
 '''
 ############## 4chan thread scrapper ################
-here we look for particular threads on 4chan and dl their images
+This script is designed to look for specific words on 4chan boards and download all images from the relevant threads.
+Usage of a VPN is recommended since 4chan is a shady place. Use at your own risk!
+   .  \\
+  ,`  (  ` SquiP
+ (   \'  "
+  `-.__)_
 '''
 def getArgs():
     '''Gets all the arguments passed to the script and returns them in a parse_args()-type object.
     No args
@@ -29,7 +40,6 @@ def getArgs():
     parser.add_argument("-f", "--folder", help = "Folder in which downloads will go, ex: ./downloads", action="store", type=str, required=True)
     parser.add_argument("-k", "--keyword", help = "keyword or phrase to look for in the threads, ex : 'cute thread'", action="store", type=str, required=True)
     parser.add_argument("-c", "--constant", help = "Constantly download", action="store_true")
-    parser.add_argument("-t", "--threads", help = "Number of threads in case of constant run, defaults to 2", action="store", type=int, required=False)
     #Creating the args object
     args=parser.parse_args()
@@ -105,35 +115,60 @@ def item_dl(sources, dlfolder):
     '''
     Download all items in the sources list to folder dlfolder, which we try to create
     Args:
     - sources : a list of URLs
+    - global folder_content : see folder_watch()
     '''
-    #Making folder
+    global folder_content
     try:
+        #Making folder
         mkdir(dlfolder)
     except FileExistsError:
         print(f"{dlfolder} already exists, not creating")
-    #Deduplicating
-    imagenames = []
-    dir_content = listdir(dlfolder)
     for source in sources:
         fullsource = "http://" + source
         imagename = findall(r"[^\/]*$", source)[0]
-        if imagename[:-4] not in str(dir_content):
+        if imagename[:-4] not in folder_content:
             name = wget.download(fullsource, out=dlfolder)
             print(f"{name} downloaded")
     return True
-def constant_dl(folder, url):
+def folder_watch(folder):
+    '''
+    Watch the content of a folder and return it.
+    Content is a string containing the names of all the elements.
+    Args:
+    - folder : folder to watch
+    - global folder_content : the string this function rebuilds
+    Returns:
+    - folder_content : said string, containing the names of all the files in the folder
+    '''
+    global folder_content
+    folder_list = listdir(folder)
+    folder_content = ""
+    for i in folder_list:
+        folder_content += i
+def dl_threads(folder, url):
     '''
     Constantly download...
     Args:
     - folder : folder to dl into
     - url : board to watch
     '''
     while True:
+        try:
             sleep(2)
             soup = html_get(url)
             hrefs = thread_finder(soup, keyword)
@@ -147,9 +182,10 @@ def constant_dl(folder, url):
                 print(f"going after {url}{href}")
                 subsoup = html_get(f"{url}{href}")
                 subsources = scraper(subsoup)
                 print(subsources)
+                folder_watch(folder)
                 item_dl(subsources, folder)
         except Exception as e:
             print(f"Houston, we had a problem: \n{e}")
@@ -157,15 +193,12 @@ def constant_dl(folder, url):
 args = getArgs()
 folder = args.folder
 keyword = args.keyword
-if args.threads:
-    threadnumber = args.threads
-else:
-    threadnumber = 2
 url = args.url
 soup = html_get(url)
 hrefs = thread_finder(soup, keyword)
 sources = scraper(soup)
+folder_content = ""
 #item_dl(sources, folder)
 #Dling all threads found
@@ -173,16 +206,11 @@ sources = scraper(soup)
 #oneshot
 if not args.constant:
     for href in hrefs:
         print(f"going after {url}{href}")
         subsoup = html_get(f"{url}{href}")
         subsources = scraper(subsoup)
         print(subsources)
         item_dl(subsources, folder)
+        folder_watch(folder)
+        dl_threads(folder, url)
 else:
-    thread_objects = []
-    for i in range (1, threadnumber):
-        thread_objects.append(Thread(target=constant_dl, args=(folder, url)))
-    for thread in thread_objects:
-        thread.start()
     while True:
+        folder_watch(folder)
+        dl_threads(folder, url)
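Taken together, the new flow replaces threads with a single loop plus a global string: folder_watch() flattens the names of everything already on disk into folder_content, and item_dl() skips any image whose basename (minus its four-character extension, e.g. ".jpg") already appears in that string. A condensed, self-contained sketch of that deduplication idea, mirroring the diff rather than quoting it:

    from os import listdir
    from re import findall

    folder_content = ""  # global: concatenated names of files already downloaded

    def folder_watch(folder):
        # Rebuild the global string from the folder's current contents.
        global folder_content
        folder_content = "".join(listdir(folder))

    def is_new(source):
        # Basename of the URL, minus a 4-character extension such as ".jpg".
        imagename = findall(r"[^\/]*$", source)[0]
        return imagename[:-4] not in folder_content

One quirk of this scheme: because the test is a substring match against one concatenated string, a name that happens to occur inside another file's name is also treated as already downloaded.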

View File

@@ -46,11 +46,8 @@ Use (constant, multi-threaded):
 * -f : folder where you want to download all pictures
 * -k : keyword or keyphrase to search (better use a single word!)
 * -c : constant : enables constant downloading
-* -t 3 : number of threads. Here, 3 threads keep on running to constantly download
 ## Todo
 * Filter by filetype
-* Multi-threaded not really working, -t 2 gives one thread and many threads will cause duplicates
-* Use a try / catch when dling since some threads go 404 and it gives us a crash
 * Make a pretty website with some keywords running in the bg, making for some nice public folders (wallpapers...)
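With the -t flag gone, one-shot and constant runs now differ only in the -c switch. A typical invocation after this change might look like the following, where the script filename and the -u flag for the board URL are assumptions not shown in this diff:

    python3 scraper.py -u https://boards.4chan.org/b/ -f ./downloads -k 'cute thread' -c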