Compare commits: eca7bed6f8 ... master (11 commits)

Commits in this range (SHA1):
ca78d58458, b2e7deb058, e06953ccf6, 77c20d67f1, 52fdd4f4b1, a97067d452, aa67b222d8, 5f31ab9fcc, 913208274b, 85a798b311, 2a5eb235e4
Binary image file changed, not shown (before: 88 KiB).
@@ -7,15 +7,28 @@ from bs4 import BeautifulSoup
 import mechanicalsoup
 import wget
 from os import mkdir, listdir
+from sys import path
 from re import findall
 from time import sleep
 from threading import Thread
+import logging

 '''
 ############## 4chan thread scrapper ################
-here we look for particular threads on 4chan and dl their images
+This script is designed to look for specific words on 4chan boards and download all images from the relevant threads.
+
+Usage of a VPN is recommended since 4chan is a shady place. Use at your own risk !
+
+. \\
+,` ( ` SquiP
+( \' "
+`-.__)_
+
 '''


 def getArgs():
     '''Gets all the arguments passed to the script and returns them in a parse_args()-type object.
     No args
@@ -27,9 +40,9 @@ def getArgs():
     parser = argparse.ArgumentParser()
     parser.add_argument("-u", "--url", help = "URL of the board. MUST INCLUDE FINAL /, ex : https://boards.4chan.org/b/", action="store", type=str, required=True)
     parser.add_argument("-f", "--folder", help = "Folder in which downloads will go, ex: ./downloads", action="store", type=str, required=True)
-    parser.add_argument("-k", "--keyword", help = "keyword or phrase to look for in the threads, ex : 'cute thread'", action="store", type=str, required=True)
+    parser.add_argument("-k", "--keyword", help = "keyword or phrase to look for in the threads, ex : 'cute thread'. Argument can be used multiple times", action='append', required=True)
+    parser.add_argument("-l", "--logfile", help="Name of the logfile. Please provide a name that is not already in use.", type=str, required=False)
     parser.add_argument("-c", "--constant", help = "Constantly download", action="store_true")
-    parser.add_argument("-t", "--threads", help = "Number of threads in case of constant run, defaults to 2", action="store", type=int, required=False)

     #Creating the args object
     args=parser.parse_args()
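The -k option now uses argparse's append action, so the flag can be passed several times and parse_args() collects every value into a list. A minimal sketch of that behaviour (the flag values below are only illustrative):

import argparse

# Sketch of the new -k behaviour: action='append' gathers repeated flags into a list.
parser = argparse.ArgumentParser()
parser.add_argument("-k", "--keyword", action="append", required=True)

args = parser.parse_args(["-k", "cute thread", "-k", "wallpaper"])
print(args.keyword)  # ['cute thread', 'wallpaper']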
@@ -41,7 +54,7 @@ def thread_finder(homepage, keyword):
     returns a list of all the threads where op said keyword on 4chan homepage
     Args:
         - homepage: bs4 soup object containing html from the homepage of the board
-        - keyword : any single word
+        - keyword : list of expressions to look for
     Returns:
         - hrefs : all the references to matching threads. They must be appended to homepage to work
     '''
@@ -52,11 +65,13 @@ def thread_finder(homepage, keyword):
     for thread in threads:
         texts = thread.findAll('blockquote', {'class' : 'postMessage'})
         for text in texts:
-            if keyword.lower() in text.text.lower() and "loli" not in text.text.lower() and "shota" not in text.text.lower():
-                links = thread.findAll('a', {'title': 'Reply to this post'})
-                for link in links:
-                    hrefs.append(f"{link['href']}")
+            for word in keyword:
+                if word.lower() in text.text.lower() and "loli" not in text.text.lower() and "shota" not in text.text.lower():
+                    print(f"Found {word}")
+                    links = thread.findAll('a', {'title': 'Reply to this post'})
+                    for link in links:
+                        hrefs.append(f"{link['href']}")
     return hrefs
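Since keyword is now a list, thread_finder() loops over every expression for each post. A self-contained sketch of the same matching logic on a hand-written snippet of board HTML (the markup is illustrative, not a real 4chan page):

from bs4 import BeautifulSoup

# Illustrative board markup: one thread with a matching post and a reply link.
html = """
<div class="thread">
  <blockquote class="postMessage">Cute thread, post your best</blockquote>
  <a title="Reply to this post" href="thread/12345#p12345">Reply</a>
</div>
"""
soup = BeautifulSoup(html, "html.parser")
keywords = ["cute thread", "wallpaper"]

hrefs = []
for thread in soup.findAll("div", {"class": "thread"}):
    for text in thread.findAll("blockquote", {"class": "postMessage"}):
        for word in keywords:
            if word.lower() in text.text.lower():
                for link in thread.findAll("a", {"title": "Reply to this post"}):
                    hrefs.append(link["href"])

print(hrefs)  # ['thread/12345#p12345']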
@@ -73,7 +88,6 @@ def html_get(url):
     browser = mechanicalsoup.Browser()
     try:
         page = browser.get(url)
-        print(f"Got html from {url}")
         return page.soup
     except Exception as e:
         print(f"Got error {e}")
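For context, html_get() wraps mechanicalsoup: Browser.get() returns a requests response with a parsed .soup attribute attached. A minimal sketch, using the same example board URL as the -u help text:

import mechanicalsoup

browser = mechanicalsoup.Browser()
page = browser.get("https://boards.4chan.org/b/")  # example board URL
print(page.soup.title)  # the parsed BeautifulSoup document is available as .soup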
@@ -105,35 +119,70 @@ def item_dl(sources, dlfolder):
     '''
     Download all items in the sources list to folder dlfolder, which we try to create"
     Args:
         - sources : a list of URLs
+        - global folder_content : see folder_watch()
     '''
-    #Making folder
-    try:
-        mkdir(dlfolder)
-    except FileExistsError:
-        print(f"{dlfolder} already exists, not creating")
+
+    global folder_content

     #Deduplicating
     imagenames = []
-    dir_content = listdir(dlfolder)

     for source in sources:
         fullsource = "http://" + source
         imagename = findall(r"[^\/]*$", source)[0]
-        if imagename[:-4] not in str(dir_content):
+        if imagename[:-4] not in folder_content:
             name = wget.download(fullsource, out=dlfolder)
-            print(f"{name} downloaded")
+            print(f"{name} downloaded from {source}")

     return True

-def constant_dl(folder, url):
+def folder_create(dlfolder):
+    '''
+    Create the folder if it does not exist
+    Args:
+        - dlfolder : path of folder to create
+    '''
+    try:
+        #Making folder
+        mkdir(dlfolder)
+    except FileExistsError:
+        print(f"{dlfolder} already exists, not creating")
+
+
+def folder_watch(folder):
+    '''
+    Watch for the content of a folder and return its content.
+    Content is a string containing all the names of all the elements.
+    Args:
+        - folder : folder to watch
+        - global folder_content : see folder_watch()
+    Returns:
+        folder_content : said string, containing all the names of all the files in the folder
+    '''
+
+    global folder_content
+
+    folder_list = listdir(folder)
+    folder_content = ""
+
+    for i in folder_list:
+        folder_content += i
+
+
+def dl_threads(folder, url, log_enabled):
     '''
     Constantly download...
     Args:
         - folder: folder to dl into
         - url : board to watch
+        - log_enabled : Set True if logging lib is used
     '''
-    while True:
+    try:
         sleep(2)
         soup = html_get(url)
         hrefs = thread_finder(soup, keyword)
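The commit replaces the per-call listdir() in item_dl() with a global folder_content string that folder_watch() refreshes; a download is skipped when the file name (minus its extension) already appears in that string. A small sketch of the check, using a temporary folder and an illustrative file name:

from os import listdir
from pathlib import Path
from tempfile import mkdtemp

dlfolder = mkdtemp()
Path(dlfolder, "1589912345678.jpg").touch()  # pretend this was downloaded earlier

# folder_watch() equivalent: concatenate every file name into one string
folder_content = ""
for name in listdir(dlfolder):
    folder_content += name

# item_dl() equivalent: strip the extension and test membership
imagename = "1589912345678.jpg"
print(imagename[:-4] in folder_content)  # True -> already there, skip the download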
@@ -144,12 +193,21 @@ def constant_dl(folder, url):

     #oneshot
     for href in hrefs:
-        print(f"going after {url}{href}")
         subsoup = html_get(f"{url}{href}")
         subsources = scraper(subsoup)
-        print(subsources)
-        item_dl(subsources, folder)
+        folder_watch(folder)
+        try:
+            item_dl(subsources, folder)
+            if log_enabled:
+                logging.info(f"Downloaded {url}{href}")
+            sleep(2)
+        except HTTPSConnectionPool as ConnErr:
+            if log_enabled:
+                logging.error(f"Got Error {ConErr}, pipes must be clogged lulz")
+            else:
+                print(f"Got Error {ConErr}, pipes must be clogged lulz")
+        except Exception as e:
+            print(f"Houston, we had a problem with {url} and {folder}: \n{e}")
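The new except clause refers to HTTPSConnectionPool, which this file does not import. As a hedged alternative sketch: mechanicalsoup is built on requests, so a dropped connection can be caught as requests.exceptions.ConnectionError (the helper name below is hypothetical, not the commit's code):

import logging
import requests
import mechanicalsoup

def get_soup_safely(url, log_enabled=False):
    # Hypothetical helper: same flow as the commit, but catching the
    # connection error class that requests actually raises.
    browser = mechanicalsoup.Browser()
    try:
        return browser.get(url).soup
    except requests.exceptions.ConnectionError as conn_err:
        if log_enabled:
            logging.error(f"Got Error {conn_err}, pipes must be clogged lulz")
        else:
            print(f"Got Error {conn_err}, pipes must be clogged lulz")
        return None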
@@ -157,15 +215,29 @@ def constant_dl(folder, url):
 args = getArgs()
 folder = args.folder
 keyword = args.keyword
-if args.threads:
-    threadnumber = args.threads
+if args.logfile:
+    logfile = args.logfile
+
+    #Creating Logfile
+    logging.basicConfig(\
+        format='%(asctime)s %(levelname)-8s %(message)s',\
+        filename=f"{path[0]}/{args.logfile}",\
+        level = logging.INFO,\
+        datefmt='%Y-%m-%d %H:%M:%S'\
+        )
+    log_enabled = True
 else:
-    threadnumber = 2
+    log_enabled = False

 url = args.url
 soup = html_get(url)
 hrefs = thread_finder(soup, keyword)
 sources = scraper(soup)
+folder_create(folder)
+folder_content = ""
+
+print("Lurking...")
 #item_dl(sources, folder)

 #Dling all threads found
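When -l is given, the script configures the root logger once at startup and later calls log through it. A sketch of the same basicConfig call, writing to a temporary directory instead of path[0] (the paths and the logged URL are illustrative):

import logging
from tempfile import gettempdir

logging.basicConfig(
    format='%(asctime)s %(levelname)-8s %(message)s',
    filename=f"{gettempdir()}/scraper.log",  # the real script uses path[0]/<logfile>
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S',
)
logging.info("Downloaded https://boards.4chan.org/b/thread/12345")
# scraper.log then holds a line like:
# 2020-05-19 18:03:12 INFO     Downloaded https://boards.4chan.org/b/thread/12345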
@@ -173,16 +245,11 @@ sources = scraper(soup)
 #oneshot
 if not args.constant:
     for href in hrefs:
-        print(f"going after {url}{href}")
-        subsoup = html_get(f"{url}{href}")
-        subsources = scraper(subsoup)
-        print(subsources)
-        item_dl(subsources, folder)
+        folder_watch(folder)
+        dl_threads(folder, url, log_enabled)
 else:
-    thread_objects = []
-    for i in range (1, threadnumber):
-        thread_objects.append(Thread(target=constant_dl, args=(folder, url)))
-    for thread in thread_objects:
-        thread.start()
+    while True:
+        folder_watch(folder)
+        dl_threads(folder, url)
+        sleep(60)
+        print('Sayonara')
@@ -46,11 +46,8 @@ Use (constant, multi-threaded):
 * -f : folder where you want to download all pictures
 * -k : keyword or keyphrase to search (better use a single word !)
 * -c : constant : enables constant downloading
-* -t 3 : number of threads. Here, 3 threads keep on running to constantly download

 ##Todo
 * Filter by filetype
-* Multi-threaded not really working, -t 2 gives one thread and many threads will cause duplicates
-* Use a try / catch when dling since some threads go 404 and it gives us a crash
 * Make a pretty website with some keywords running in the bg, making for some nice public folders (wallpapers...)
scrape.py (36 changed lines)
@@ -6,10 +6,10 @@ from bs4 import BeautifulSoup
 #Connexion web
 import mechanicalsoup
 import wget
-from os import mkdir
+from os import mkdir, listdir

 '''
 ############## 4chan thread scrapper ################
-Give me a page, i'll download all of its pictures !
+Give me a 4chan thread, i'll download all of its pictures !
 '''
@@ -57,10 +57,10 @@ def scraper(soup):
     Returns:
         - sources : A list of image sources
     '''
-    tags=soup.findAll('img')
+    tags=soup.findAll('a', {"class": "fileThumb"})
     sources = []
     for tag in tags:
-        sources.append(tag['src'])
+        sources.append(tag['href'])

     #Got image sources, removing any left http headers
     for index, item in enumerate(sources):
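The scraper now follows the <a class="fileThumb"> link, which points at the full-size image, instead of the thumbnail <img> src. A sketch on a hand-written snippet (the markup and file names are illustrative):

from bs4 import BeautifulSoup

html = """
<a class="fileThumb" href="//i.4cdn.org/b/1589912345678.jpg">
  <img src="//i.4cdn.org/b/1589912345678s.jpg">
</a>
"""
soup = BeautifulSoup(html, "html.parser")

sources = [tag["href"] for tag in soup.findAll("a", {"class": "fileThumb"})]
print(sources)  # ['//i.4cdn.org/b/1589912345678.jpg'] -- full-size image, not the thumbnail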
@@ -84,8 +84,32 @@ def item_dl(sources, dlfolder):

     for source in sources:
         fullsource = "http://" + source
-        name = wget.download(fullsource, out=dlfolder)
-        print(f"{name} downloaded")
+        if dlfolder[-1] == "/":
+            dlfolder = dlfolder[:-1]
+        if not deduplicate(fullsource, dlfolder.split("/")[-1]):
+            name = wget.download(fullsource, out=dlfolder)
+            print(f"{name} downloaded")
+        else:
+            print(f"{source} is already there")
+
+def deduplicate(url, folder):
+    '''
+    Takes a url to an image and a folder, check if said image exists in said folder
+    Args:
+        - url : a str containing a full url to an image
+        - folder : Name of a folder in /Images
+    Returns:
+        - True : The image is already in the folder
+        - False : The image is not in the folder
+    '''
+    image_name = url.split("/")[-1]
+    image_name = image_name.split("?")[0]
+    files = listdir(f"/Images/{folder}")
+
+    for i in files:
+        if i == image_name:
+            return True
+    return False
+

 args = getArgs()
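deduplicate() keys the comparison on the file name taken from the URL, with any query string stripped, and then checks it against the listing of /Images/<folder>. A one-line sketch of that extraction (the URL is illustrative):

url = "http://i.4cdn.org/b/1589912345678.jpg?ver=2"
image_name = url.split("/")[-1].split("?")[0]
print(image_name)  # 1589912345678.jpg -- compared against listdir(f"/Images/{folder}")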