Update 'scrape.py'

Fixed dedupe
Update 'scrape.py'
2021-08-12 00:54:29 +00:00 · 2021-08-12 00:26:36 +00:00 · 2021-02-19 11:00:15 +00:00 · 2021-02-19 10:52:22 +00:00 · 2021-02-10 14:26:26 +00:00 · 2021-02-09 20:19:48 +00:00
4 changed files with 138 additions and 50 deletions
--- a/1612736548910.pngyc4rlphz.tmp
+++ b/1612736548910.pngyc4rlphz.tmp
--- a/4chanthreadfinder.py
+++ b/4chanthreadfinder.py
@ -7,15 +7,28 @@ from bs4 import BeautifulSoup
 import mechanicalsoup
 import wget
 from os import mkdir, listdir
+from sys import path
 from re import findall
 from time import sleep
 from threading import Thread
+import logging
+
 '''
 ############## 4chan thread scrapper ################
-here we look for particular threads on 4chan and dl their images
+This script is designed to look for specific words on 4chan boards, and download all images from the relevant threads.
+Usage of a VPN is recommended since 4chan is a shady place. Use at your own risk !
+
+
+   .  \\
+   ,` ( `   SquiP
+  (  \' "
+   `-.__)_ 
+
 '''


+
+
 def getArgs():
    '''Gets all the arguments passed to the script and returns them in a parse_args()-type object.
    No args
@ -27,9 +40,9 @@ def getArgs():
    parser = argparse.ArgumentParser()
    parser.add_argument("-u", "--url", help = "URL of the board. MUST INCLUDE FINAL /, ex : https://boards.4chan.org/b/", action="store", type=str, required=True)
    parser.add_argument("-f", "--folder", help = "Folder in which downloads will go, ex: ./downloads", action="store", type=str, required=True)
-    parser.add_argument("-k", "--keyword", help = "keyword or phrase to look for in the threads, ex : 'cute thread'", action="store", type=str, required=True)
+    parser.add_argument("-k", "--keyword", help = "keyword or phrase to look for in the threads, ex : 'cute thread'. Argument can be used multiple times", action='append', required=True)
+    parser.add_argument("-l", "--logfile", help="Name of the logfile. Please provide a name that is not already in use.", type=str, required=False)
    parser.add_argument("-c", "--constant", help = "Constantly download", action="store_true")
-    parser.add_argument("-t", "--threads", help = "Number of threads in case of constant run, defaults to 2", action="store", type=int, required=False)

    #Creating the args object
    args=parser.parse_args()
@ -41,7 +54,7 @@ def thread_finder(homepage, keyword):
    returns a list of all the threads where op said keyword on 4chan homepage
    Args:
    - homepage: bs4 soup object containing html from the homepage of the board
-    - keyword : any single word
+    - keyword : list of expressions to look for
    Returns:
    - hrefs : all the references to matching threads. They must be appended to homepage to work
    '''
@ -52,11 +65,13 @@ def thread_finder(homepage, keyword):
    for thread in threads:
        texts = thread.findAll('blockquote', {'class' : 'postMessage'})
        for text in texts:
-            if keyword.lower() in text.text.lower() and "loli" not in text.text.lower() and "shota" not in text.text.lower():
-                links = thread.findAll('a', {'title': 'Reply to this post'})
-                for link in links:
-                    hrefs.append(f"{link['href']}")
-    return hrefs
+            for word in keyword:
+                if word.lower() in text.text.lower() and "loli" not in text.text.lower() and "shota" not in text.text.lower():
+                    print(f"Found {word}")
+                    links = thread.findAll('a', {'title': 'Reply to this post'})
+                    for link in links:
+                        hrefs.append(f"{link['href']}")
+        return hrefs
    


@ -73,7 +88,6 @@ def html_get(url):
    browser = mechanicalsoup.Browser()
    try:
        page = browser.get(url)
-        print(f"Got html from {url}")
        return page.soup
    except Exception as e:
        print(f"Got error {e}")
@ -105,35 +119,70 @@ def item_dl(sources, dlfolder):
    '''
    Download all items in the sources list to folder dlfolder, which we try to create"
    Args:
-    - sources : a list of URLs
+    - sources : a list of URLs
+    - global folder_content : see folder_watch()
+
    '''
-    #Making folder
-    try:
-        mkdir(dlfolder)
-    except FileExistsError:
-        print(f"{dlfolder} already exists, not creating")
+    
+    global folder_content
    
    #Deduplicating
    imagenames = []
-    dir_content = listdir(dlfolder)

    for source in sources:
        fullsource = "http://" + source
        imagename = findall(r"[^\/]*$", source)[0]
-        if imagename[:-4] not in str(dir_content):
+        if imagename[:-4] not in folder_content:
            name = wget.download(fullsource, out=dlfolder)
-            print(f"{name} downloaded")
+            print(f"{name} downloaded from {source}")

    return True

-def constant_dl(folder, url):
+def folder_create(dlfolder):
+    '''
+    Create the folder if it does not exist
+    Args:
+    - dlfolder : path of folder to create
+    '''
+
+    try:
+    #Making folder
+        mkdir(dlfolder)
+    except FileExistsError:
+        print(f"{dlfolder} already exists, not creating")
+
+
+
+def folder_watch(folder):
+    '''
+    List the content of a folder and store it in the global folder_content.
+    The content is a single string made of all the entry names concatenated without any separator.
+    Args:
+    - folder : folder to watch
+    - global folder_content : overwritten with the string described above
+    Returns:
+    Nothing; the result is only available through the global folder_content
+    '''
+
+    global folder_content
+
+    folder_list = listdir(folder)
+    folder_content = "" 
+
+    for i in folder_list:
+        folder_content += i
+
+
+def dl_threads(folder, url, log_enabled):
    '''
    Constantly download...
    Args: 
    - folder: folder to dl into
    - url : board to watch
+    - log_enabled : Set True if logging lib is used
    '''
-    while True:
+
+    try:
        sleep(2)
        soup = html_get(url)
        hrefs = thread_finder(soup, keyword)
@ -144,12 +193,21 @@ def constant_dl(folder, url):

        #oneshot
        for href in hrefs:
-            print(f"going after {url}{href}")
            subsoup = html_get(f"{url}{href}")
            subsources = scraper(subsoup)
-            print(subsources)
-            item_dl(subsources, folder)
-
+            folder_watch(folder)
+            try:
+                item_dl(subsources, folder)
+                if log_enabled:
+                    logging.info(f"Downloaded {url}{href}") 
+                sleep(2)
+            except HTTPSConnectionPool as ConnErr:
+                if log_enabled:
+                    logging.error(f"Got Error {ConErr}, pipes must be clogged lulz")
+                else:
+                    print(f"Got Error {ConErr}, pipes must be clogged lulz")
+    except Exception as e:
+        print(f"Houston, we had a problem with {url} and {folder}: \n{e}")



@ -157,15 +215,29 @@ def constant_dl(folder, url):
 args = getArgs()
 folder = args.folder
 keyword = args.keyword
-if args.threads:
-    threadnumber = args.threads
+if args.logfile:
+    logfile = args.logfile
+
+    #Creating Logfile
+    logging.basicConfig(\
+            format='%(asctime)s %(levelname)-8s %(message)s',\
+            filename=f"{path[0]}/{args.logfile}",\
+            level = logging.INFO,\
+            datefmt='%Y-%m-%d %H:%M:%S'\
+            )
+    log_enabled = True
 else:
-    threadnumber = 2
+    log_enabled = False
+

 url = args.url
 soup = html_get(url)
 hrefs = thread_finder(soup, keyword)
 sources = scraper(soup)
+folder_create(folder)
+folder_content = ""
+
+print("Lurking...")
 #item_dl(sources, folder)

 #Dling all threads found
@ -173,16 +245,11 @@ sources = scraper(soup)
 #oneshot
 if not args.constant:
    for href in hrefs:
-        print(f"going after {url}{href}")
-        subsoup = html_get(f"{url}{href}")
-        subsources = scraper(subsoup)
-        print(subsources)
-        item_dl(subsources, folder)
+        folder_watch(folder)
+        dl_threads(folder, url, log_enabled)
 else:
-    thread_objects = []
-    for i in range (1, threadnumber):
-        thread_objects.append(Thread(target=constant_dl, args=(folder, url)))
-    for thread in thread_objects:
-        thread.start()
-
-
+    while True:
+            folder_watch(folder)
+            dl_threads(folder, url)
+            sleep(60)            
+print('Sayonara')
--- a/README.md
+++ b/README.md
@ -46,11 +46,8 @@ Use (constant, multi-threaded):
 * -f : folder where you want to download all pictures
 * -k : keyword or keyphrase to search (better use a single word !)
 * -c : constant : enables constant downloading
-* -t 3 : number of threads. Here, 3 threads keep on running to constantly download

 ## Todo
 * Filter by filetype
-* Multi-threaded not really working, -t 2 gives one thread and many threads will cause duplicates
-* Use a try / catch when dling since some threads go 404 and it gives us a crash
 * Make a pretty website with some keywords running in the bg, making for some nice public folders (wallpapers...)

--- a/scrape.py
+++ b/scrape.py
@ -6,10 +6,10 @@ from bs4 import BeautifulSoup
 #Connexion web
 import mechanicalsoup
 import wget
-from os import mkdir
+from os import mkdir, listdir
 '''
 ############## 4chan thread scrapper ################
-Give me a page, i'll download all of its pictures !
+Give me a 4chan thread, I'll download all of its pictures!
 '''


@ -57,10 +57,10 @@ def scraper(soup):
    Returns:
    - sources : A list of image sources
    '''
-    tags=soup.findAll('img')
+    tags=soup.findAll('a', {"class": "fileThumb"})
    sources = []
    for tag in tags:
-        sources.append(tag['src'])
+        sources.append(tag['href'])

    #Got image sources, removing any left http headers
    for index, item in enumerate(sources):
@ -84,8 +84,32 @@ def item_dl(sources, dlfolder):

    for source in sources:
        fullsource = "http://" + source
-        name = wget.download(fullsource, out=dlfolder)
-        print(f"{name} downloaded")
+        if dlfolder[-1] == "/":
+            dlfolder = dlfolder[:-1]
+        if not deduplicate(fullsource, dlfolder.split("/")[-1]):
+            name = wget.download(fullsource, out=dlfolder)
+            print(f"{name} downloaded")
+        else:
+            print(f"{source} is already there")
+
+def deduplicate(url, folder):
+    '''
+    Takes a url to an image and a folder, check if said image exists in said folder
+    Args:
+    - url : a str containing a full url to an image
+    - folder : Name of a folder in /Images
+    Returns:
+    - True : The image is already in the folder
+    - False : The image is not in the folder
+    '''
+    image_name = url.split("/")[-1]
+    image_name = image_name.split("?")[0]
+    files = listdir(f"/Images/{folder}")
+
+    for i in files:
+        if i == image_name:
+            return True
+    return False


 args = getArgs()
Author	SHA1	Message	Date
justine	ca78d58458	Update 'scrape.py' Fixed dedupe	2021-08-12 00:54:29 +00:00
justine	b2e7deb058	Update 'scrape.py' Added dedup	2021-08-12 00:26:36 +00:00
justine	e06953ccf6	Update '4chanthreadfinder.py' Change loglevel to INFO as not be flooded by mechanicalSoup	2021-02-19 11:00:15 +00:00
justine	77c20d67f1	Update '4chanthreadfinder.py' Added logging	2021-02-19 10:52:22 +00:00
justine	52fdd4f4b1	Update '4chanthreadfinder.py' Added multiple keywords support	2021-02-10 14:26:26 +00:00
justine	a97067d452	Update '4chanthreadfinder.py' Bugfixes	2021-02-09 20:19:48 +00:00
Justine	aa67b222d8	menage	2021-02-09 21:49:12 +01:00
Justine	5f31ab9fcc	Plus de threads, trop complexe + ajout try sur download	2021-02-08 17:39:49 +01:00
Justine	913208274b	Plus de threads, trop complexe	2021-02-08 17:38:31 +01:00
Justine	85a798b311	Merge branch 'master' of https://gitea.squirrelsystem.fr/justine/ImageScrapper	2021-02-08 17:04:30 +01:00
Justine	2a5eb235e4	Ajout boucle try sur les dls	2021-02-08 17:04:12 +01:00