diff --git a/4chanthreadfinder.py b/4chanthreadfinder.py
index 90d3a94..fbb6cd9 100755
--- a/4chanthreadfinder.py
+++ b/4chanthreadfinder.py
@@ -8,6 +8,8 @@ import mechanicalsoup
 import wget
 from os import mkdir, listdir
 from re import findall
+from time import sleep
+from threading import Thread
 '''
 ############## 4chan thread scrapper ################
 here we look for particular threads on 4chan and dl their images
@@ -26,6 +28,8 @@ def getArgs():
     parser.add_argument("-u", "--url", help = "URL of the board. MUST INCLUDE FINAL /, ex : https://boards.4chan.org/b/", action="store", type=str, required=True)
     parser.add_argument("-f", "--folder", help = "Folder in which downloads will go, ex: ./downloads", action="store", type=str, required=True)
     parser.add_argument("-k", "--keyword", help = "keyword or phrase to look for in the threads, ex : 'cute thread'", action="store", type=str, required=True)
+    parser.add_argument("-c", "--constant", help = "Constantly download", action="store_true")
+    parser.add_argument("-t", "--threads", help = "Number of threads in case of constant run, defaults to 2", action="store", type=int, required=False)
 
     #Creating the args object
     args=parser.parse_args()
@@ -114,7 +118,7 @@ def item_dl(sources, dlfolder):
     dir_content = listdir(dlfolder)
     for index,source in enumerate(sources):
         imagename = findall(r"[^\/]*$", source)[0]
-        if imagename in dir_content:
+        if imagename in dir_content or f"{imagename} (01).jpg" in dir_content:
             sources.pop(index)
             print(f"Found duplicate {imagename}")
 
@@ -123,11 +127,44 @@ def item_dl(sources, dlfolder):
         name = wget.download(fullsource, out=dlfolder)
         print(f"{name} downloaded")
 
+    return True
+
+def constant_dl(folder, url):
+    '''
+    Constantly poll the board and dl images from threads matching the global keyword
+    Args:
+        - folder: folder to dl into
+        - url : board to watch
+    '''
+
+    #Keep polling the board: scrape the catalog, download, wait, repeat
+    while True:
+        soup = html_get(url)
+        hrefs = thread_finder(soup, keyword)
+
+        #Dling all threads found in this pass
+        for href in hrefs:
+            print(f"going after {url}{href}")
+            subsoup = html_get(f"{url}{href}")
+            subsources = scraper(subsoup)
+            print(subsources)
+            item_dl(subsources, folder)
+
+        #Wait a bit before polling the board again
+        sleep(2)
+
+
+
 
 #Getting main elements
 args = getArgs()
 folder = args.folder
 keyword = args.keyword
+if args.threads:
+    threadnumber = args.threads
+else:
+    threadnumber = 2
+
 url = args.url
 soup = html_get(url)
 hrefs = thread_finder(soup, keyword)
@@ -136,9 +173,20 @@ sources = scraper(soup)
 #Dling all threads found
 
-for href in hrefs:
-    print(f"going after {url}{href}")
-    subsoup = html_get(f"{url}{href}")
-    subsources = scraper(subsoup)
-    print(subsources)
-    item_dl(subsources, folder)
+#Oneshot run: download from every matching thread once, then exit
+if not args.constant:
+    for href in hrefs:
+        print(f"going after {url}{href}")
+        subsoup = html_get(f"{url}{href}")
+        subsources = scraper(subsoup)
+        print(subsources)
+        item_dl(subsources, folder)
+
+#Constant run: spawn worker threads that keep polling the board
+else:
+    thread_objects = []
+    for _ in range(threadnumber):
+        thread_objects.append(Thread(target=constant_dl, args=(folder, url)))
+    for thread in thread_objects:
+        thread.start()
+    #Non-daemon threads keep the script running after the main code ends
 
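
For reference, the control flow that the new -c/-t options wire up looks roughly like the standalone sketch below: a fixed number of worker threads, each running an endless poll-and-download loop. This is only an illustration of the threading pattern, not code from the patch; poll_once, the two-second interval, the example board URL, and the worker count are stand-ins for constant_dl, sleep(2), the --url argument, and --threads.

    from threading import Thread
    from time import sleep

    def poll_once(board_url, folder):
        # Placeholder for one scrape-and-download pass (constant_dl does the real work)
        print(f"checking {board_url}, saving into {folder}")

    def worker(board_url, folder, interval=2):
        # Each worker loops forever, pausing between passes, like constant_dl
        while True:
            poll_once(board_url, folder)
            sleep(interval)

    if __name__ == "__main__":
        workers = [Thread(target=worker, args=("https://boards.4chan.org/b/", "./downloads"))
                   for _ in range(2)]  # mirrors the default of two worker threads
        for t in workers:
            t.start()  # non-daemon threads keep the process alive after main ends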