Add threading

Justine 2021-02-07 23:25:40 +01:00
parent bd16dc7a6e
commit f7dcea7259


@@ -8,6 +8,8 @@ import mechanicalsoup
 import wget
 from os import mkdir, listdir
 from re import findall
+from time import sleep
+from threading import Thread
 '''
 ############## 4chan thread scrapper ################
 here we look for particular threads on 4chan and dl their images
@@ -26,6 +28,8 @@ def getArgs():
 	parser.add_argument("-u", "--url", help = "URL of the board. MUST INCLUDE FINAL /, ex : https://boards.4chan.org/b/", action="store", type=str, required=True)
 	parser.add_argument("-f", "--folder", help = "Folder in which downloads will go, ex: ./downloads", action="store", type=str, required=True)
 	parser.add_argument("-k", "--keyword", help = "keyword or phrase to look for in the threads, ex : 'cute thread'", action="store", type=str, required=True)
+	parser.add_argument("-c", "--constant", help = "Constantly download", action="store_true")
+	parser.add_argument("-t", "--threads", help = "Number of threads in case of constant run, defaults to 2", action="store", type=int, required=False)
 
 	#Creating the args object
 	args=parser.parse_args()
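
With these two options a constant run is opt-in: -c switches to watcher mode and -t sizes the thread pool. Assuming the script is saved as scraper.py (the file name is not shown in this view), an invocation could look like:

    python scraper.py -u https://boards.4chan.org/b/ -f ./downloads -k 'cute thread' -c -t 4

Without -c the script does a single pass and -t has no effect.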
@@ -114,7 +118,7 @@ def item_dl(sources, dlfolder):
 	dir_content = listdir(dlfolder)
 	for index,source in enumerate(sources):
 		imagename = findall(r"[^\/]*$", source)[0]
-		if imagename in dir_content:
+		if imagename in dir_content or f"{imagename} (01).jpg" in dir_content:
 			sources.pop(index)
 			print(f"Found duplicate {imagename}")
@@ -123,11 +127,44 @@ def item_dl(sources, dlfolder):
 		name = wget.download(fullsource, out=dlfolder)
 		print(f"{name} downloaded")
+	return True
+
+def constant_dl(folder, url):
+	'''
+	Constantly download...
+	Args:
+	- folder: folder to dl into
+	- url : board to watch
+	'''
+	#keyword is read from the module globals, which are bound before the threads start
+	while True:
+		sleep(2)
+		soup = html_get(url)
+		hrefs = thread_finder(soup, keyword)
+		#Dling all threads found
+		for href in hrefs:
+			print(f"going after {url}{href}")
+			subsoup = html_get(f"{url}{href}")
+			subsources = scraper(subsoup)
+			print(subsources)
+			item_dl(subsources, folder)
 
 #Getting main elements
 args = getArgs()
 folder = args.folder
 keyword = args.keyword
+if args.threads:
+	threadnumber = args.threads
+else:
+	threadnumber = 2
 url = args.url
 soup = html_get(url)
 hrefs = thread_finder(soup, keyword)
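
The if/else fallback for the thread count could also live in argparse itself via its default parameter (an equivalent sketch, not what this commit uses):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("-t", "--threads", help = "Number of threads in case of constant run, defaults to 2", action="store", type=int, default=2)
    args = parser.parse_args([])
    print(args.threads)   #prints 2 when the flag is absent

With default=2, args.threads is always set and the explicit if/else becomes unnecessary.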
@@ -136,9 +173,20 @@ sources = scraper(soup)
 
 #Dling all threads found
-for href in hrefs:
-	print(f"going after {url}{href}")
-	subsoup = html_get(f"{url}{href}")
-	subsources = scraper(subsoup)
-	print(subsources)
-	item_dl(subsources, folder)
+#oneshot
+if not args.constant:
+	for href in hrefs:
+		print(f"going after {url}{href}")
+		subsoup = html_get(f"{url}{href}")
+		subsources = scraper(subsoup)
+		print(subsources)
+		item_dl(subsources, folder)
+#constant
+else:
+	#spawn threadnumber watcher threads, all polling the same board
+	thread_objects = []
+	for i in range(threadnumber):
+		thread_objects.append(Thread(target=constant_dl, args=(folder, url)))
+	for thread in thread_objects:
+		thread.start()
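
For reference on the lifecycle used above: Thread(target=..., args=...) wraps a callable, start() returns immediately and runs it concurrently, and because the workers are non-daemon the process stays alive after the main script falls off the end. A self-contained sketch with a hypothetical worker, not this script's code:

    from threading import Thread
    from time import sleep

    def worker(n):
        #stand-in for constant_dl
        sleep(1)
        print(f"worker {n} done")

    threads = [Thread(target=worker, args=(i,)) for i in range(2)]
    for t in threads:
        t.start()   #returns immediately, the workers run in parallel
    for t in threads:
        t.join()    #wait for both workers before exiting

Here join() returns because the workers finish; the scraper's watchers loop forever, so the script just starts them and leaves them running.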