Add threading

Justine 2021-02-07 23:25:40 +01:00
parent bd16dc7a6e
commit f7dcea7259


@@ -8,6 +8,8 @@ import mechanicalsoup
import wget
from os import mkdir, listdir
from re import findall
from time import sleep
from threading import Thread
'''
############## 4chan thread scrapper ################
here we look for particular threads on 4chan and dl their images
@@ -26,6 +28,8 @@ def getArgs():
    parser.add_argument("-u", "--url", help = "URL of the board. MUST INCLUDE FINAL /, ex : https://boards.4chan.org/b/", action="store", type=str, required=True)
    parser.add_argument("-f", "--folder", help = "Folder in which downloads will go, ex: ./downloads", action="store", type=str, required=True)
    parser.add_argument("-k", "--keyword", help = "keyword or phrase to look for in the threads, ex : 'cute thread'", action="store", type=str, required=True)
    parser.add_argument("-c", "--constant", help = "Constantly download", action="store_true")
    parser.add_argument("-t", "--threads", help = "Number of threads in case of constant run, defaults to 2", action="store", type=int, required=False)
    #Creating the args object
    args = parser.parse_args()
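# Usage sketch for the two new flags (the script filename "scraper.py" is an
# assumption, it is not named anywhere in this diff):
#
#     python scraper.py -u https://boards.4chan.org/b/ -f ./downloads \
#         -k "cute thread" -c -t 4
#
# -c switches to constant mode; -t sets how many worker threads to start.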
@@ -114,7 +118,7 @@ def item_dl(sources, dlfolder):
    dir_content = listdir(dlfolder)
    for index, source in enumerate(sources):
        imagename = findall(r"[^\/]*$", source)[0]
        if imagename in dir_content:
        if imagename in dir_content or f"{imagename} (01).jpg" in dir_content:
            sources.pop(index)
            print(f"Found duplicate {imagename}")
@@ -123,11 +127,44 @@ def item_dl(sources, dlfolder):
        name = wget.download(fullsource, out=dlfolder)
        print(f"{name} downloaded")
    return True
def constant_dl(folder, url):
    '''
    Constantly download...
    Args:
    - folder: folder to dl into
    - url : board to watch
    '''
    sleep(2)
    soup = html_get(url)
    hrefs = thread_finder(soup, keyword)
    sources = scraper(soup)
    #item_dl(sources, folder)
    #Dling all threads found
    #oneshot
    for href in hrefs:
        print(f"going after {url}{href}")
        subsoup = html_get(f"{url}{href}")
        subsources = scraper(subsoup)
        print(subsources)
        item_dl(subsources, folder)
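# A hedged sketch of how constant_dl could poll the board repeatedly (the
# while-loop and the poll_interval parameter are assumptions, not part of this
# commit; html_get, thread_finder, scraper and item_dl are the helpers defined
# earlier in the script):
#
#     def constant_dl(folder, url, keyword, poll_interval=2):
#         while True:
#             sleep(poll_interval)
#             soup = html_get(url)
#             for href in thread_finder(soup, keyword):
#                 subsoup = html_get(f"{url}{href}")
#                 item_dl(scraper(subsoup), folder)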
#Getting main elements
args = getArgs()
folder = args.folder
keyword = args.keyword
if args.threads:
    threadnumber = args.threads
else:
    threadnumber = 2
url = args.url
soup = html_get(url)
hrefs = thread_finder(soup, keyword)
@@ -136,9 +173,20 @@ sources = scraper(soup)
#Dling all threads found
for href in hrefs:
#oneshot
if not args.constant:
    for href in hrefs:
        print(f"going after {url}{href}")
        subsoup = html_get(f"{url}{href}")
        subsources = scraper(subsoup)
        print(subsources)
        item_dl(subsources, folder)
else:
    thread_objects = []
    for i in range(threadnumber):
        thread_objects.append(Thread(target=constant_dl, args=(folder, url)))
    for thread in thread_objects:
        thread.start()
#constant
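# A minimal sketch of starting the workers and then waiting for them (the
# join() calls are an assumption on top of this commit, so the main program
# does not fall through while downloads are still running):
#
#     thread_objects = [Thread(target=constant_dl, args=(folder, url))
#                       for _ in range(threadnumber)]
#     for thread in thread_objects:
#         thread.start()
#     for thread in thread_objects:
#         thread.join()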