Compare commits

...

4 Commits

Author SHA1 Message Date
Justine 5f31ab9fcc No more threads, too complex + added try on download 2021-02-08 17:39:49 +01:00
Justine 913208274b No more threads, too complex 2021-02-08 17:38:31 +01:00
Justine 85a798b311 Merge branch 'master' of https://gitea.squirrelsystem.fr/justine/ImageScrapper 2021-02-08 17:04:30 +01:00
Justine 2a5eb235e4 Added a try loop on the downloads 2021-02-08 17:04:12 +01:00
13 changed files with 53 additions and 28 deletions
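Taken together, these commits drop the multi-threaded constant downloader in favour of a single sequential loop and wrap each download pass in a try/except so a vanished (404) thread no longer crashes the run. A minimal sketch of that pattern, using hypothetical stand-ins fetch_image_urls() and download_all() rather than the script's real helpers:

    from time import sleep

    def fetch_image_urls(board_url):
        # placeholder: would scrape the board and return image URLs
        return []

    def download_all(urls, folder):
        # placeholder: would download every URL into folder
        print(f"would download {len(urls)} files into {folder}")

    def constant_run(board_url, folder):
        # one pass every 2 seconds; errors are logged instead of killing the loop
        while True:
            try:
                sleep(2)
                download_all(fetch_image_urls(board_url), folder)
            except Exception as e:
                print(f"problem during download pass:\n{e}")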

11 binary image files added (previews not shown). Sizes: 248 KiB, 272 KiB, 408 KiB, 344 KiB, 376 KiB, 408 KiB, 920 KiB, 24 KiB, 24 KiB, 24 KiB, 56 KiB.

View File

@@ -10,12 +10,23 @@ from os import mkdir, listdir
 from re import findall
 from time import sleep
 from threading import Thread
 '''
 ############## 4chan thread scrapper ################
-here we look for particular threads on 4chan and dl their images
+This script is designed to look for specific words on 4chan boards, and download all images from the relevant threads.
+Usage of a VPN is recommended since 4chan is a shady place. Use at your own risk !
+. \\
+,` ( ` SquiP
+( \' "
+`-.__)_
 '''
 def getArgs():
     '''Gets all the arguments passed to the script and returns them in a parse_args()-type object.
     No args
@@ -29,7 +40,6 @@ def getArgs():
     parser.add_argument("-f", "--folder", help = "Folder in which downloads will go, ex: ./downloads", action="store", type=str, required=True)
     parser.add_argument("-k", "--keyword", help = "keyword or phrase to look for in the threads, ex : 'cute thread'", action="store", type=str, required=True)
     parser.add_argument("-c", "--constant", help = "Constantly download", action="store_true")
-    parser.add_argument("-t", "--threads", help = "Number of threads in case of constant run, defaults to 2", action="store", type=int, required=False)
     #Creating the args object
     args=parser.parse_args()
@@ -105,35 +115,60 @@ def item_dl(sources, dlfolder):
     '''
     Download all items in the sources list to folder dlfolder, which we try to create"
     Args:
-        - sources : a list of URLs
+        - sources : a list of URLsi
+        - global folder_content : see folder_watch()
     '''
-    #Making folder
+    global folder_content
     try:
+        #Making folder
         mkdir(dlfolder)
     except FileExistsError:
         print(f"{dlfolder} already exists, not creating")
     #Deduplicating
     imagenames = []
-    dir_content = listdir(dlfolder)
     for source in sources:
         fullsource = "http://" + source
         imagename = findall(r"[^\/]*$", source)[0]
-        if imagename[:-4] not in str(dir_content):
+        if imagename[:-4] not in folder_content:
             name = wget.download(fullsource, out=dlfolder)
             print(f"{name} downloaded")
     return True
-def constant_dl(folder, url):
+def folder_watch(folder):
+    '''
+    Watch for the content of a folder and return its content.
+    Content is a string containing all the names of all the elements.
+    Args:
+        - folder : folder to watch
+        - global folder_content : see folder_watch()
+    Returns:
+        folder_content : said string, containing all the names of all the files in the folder
+    '''
+    global folder_content
+    folder_list = listdir(folder)
+    folder_content = ""
+    for i in folder_list:
+        folder_content += i
+def dl_threads(folder, url):
     '''
     Constantly download...
     Args:
         - folder: folder to dl into
         - url : board to watch
     '''
-    while True:
+    try:
         sleep(2)
         soup = html_get(url)
         hrefs = thread_finder(soup, keyword)
@@ -147,9 +182,10 @@ def constant_dl(folder, url):
             print(f"going after {url}{href}")
             subsoup = html_get(f"{url}{href}")
             subsources = scraper(subsoup)
-            print(subsources)
+            folder_watch(folder)
             item_dl(subsources, folder)
+    except Exception as e:
+        print(f"Houston, we had a problem: \n{e}")
@@ -157,15 +193,12 @@ def constant_dl(folder, url):
 args = getArgs()
 folder = args.folder
 keyword = args.keyword
-if args.threads:
-    threadnumber = args.threads
-else:
-    threadnumber = 2
 url = args.url
 soup = html_get(url)
 hrefs = thread_finder(soup, keyword)
 sources = scraper(soup)
+folder_content = ""
 #item_dl(sources, folder)
 #Dling all threads found
@@ -173,16 +206,11 @@ sources = scraper(soup)
 #oneshot
 if not args.constant:
     for href in hrefs:
-        print(f"going after {url}{href}")
-        subsoup = html_get(f"{url}{href}")
-        subsources = scraper(subsoup)
-        print(subsources)
-        item_dl(subsources, folder)
+        folder_watch(folder)
+        dl_threads(folder_url)
 else:
-    thread_objects = []
-    for i in range (1, threadnumber):
-        thread_objects.append(Thread(target=constant_dl, args=(folder, url)))
-    for thread in thread_objects:
-        thread.start()
+    while True:
+        folder_watch(folder)
+        dl_threads(folder, url)
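The new folder_watch() / item_dl() pair deduplicates by concatenating every filename already in the download folder into one global string (folder_content) and skipping any image whose basename, minus its extension, appears in that string as a substring. A standalone sketch of just that check, with hypothetical names and without the script's global state:

    from os import listdir
    from re import findall

    def build_folder_content(folder):
        # same idea as folder_watch(): one big string of existing filenames
        return "".join(listdir(folder))

    def should_download(source_url, folder_content):
        # take the basename (e.g. "abc123.jpg"), drop the 4-character extension,
        # and skip it if it already appears in the folder listing
        imagename = findall(r"[^\/]*$", source_url)[0]
        return imagename[:-4] not in folder_content

Because this is a plain substring test, a filename that happens to contain another image's basename would also be treated as already downloaded.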

View File

@@ -46,11 +46,8 @@ Use (constant, multi-threaded):
 * -f : folder where you want to download all pictures
 * -k : keyword or keyphrase to search (better use a single word !)
 * -c : constant : enables constant downloading
-* -t 3 : number of threads. Here, 3 threads keep on running to constantly download
 ##Todo
 * Filter by filetype
-* Multi-threaded not really working, -t 2 gives one thread and many threads will cause duplicates
-* Use a try / catch when dling since some threads go 404 and it gives us a crash
 * Make a pretty website with some keywords running in the bg, making for some nice public folders (wallpapers...)
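The flags above match the argparse options visible in the script diff (-f, -k, -c). As a purely illustrative invocation, assuming the entry point is named imagescrapper.py and that the board URL is passed with a -u/--url flag (implied by url = args.url in the script, not shown in this hunk):

    python3 imagescrapper.py -u https://boards.4chan.org/w/ -f ./downloads -k "wallpaper" -c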