Compare commits

...

7 Commits

Author SHA1 Message Date
ca78d58458 Update 'scrape.py'
Fixed dedupe
2021-08-12 00:54:29 +00:00
b2e7deb058 Update 'scrape.py'
Added dedup
2021-08-12 00:26:36 +00:00
e06953ccf6 Update '4chanthreadfinder.py'
Change loglevel to INFO so as not to be flooded by mechanicalSoup
2021-02-19 11:00:15 +00:00
77c20d67f1 Update '4chanthreadfinder.py'
Added logging
2021-02-19 10:52:22 +00:00
52fdd4f4b1 Update '4chanthreadfinder.py'
Added multiple keywords support
2021-02-10 14:26:26 +00:00
a97067d452 Update '4chanthreadfinder.py'
Bugfixes
2021-02-09 20:19:48 +00:00
aa67b222d8 menage
2021-02-09 21:49:12 +01:00
14 changed files with 91 additions and 28 deletions

12 binary files (deleted images) not shown. Former sizes: 88 KiB, 248 KiB, 272 KiB, 408 KiB, 344 KiB, 376 KiB, 408 KiB, 920 KiB, 24 KiB, 24 KiB, 24 KiB, 56 KiB.

View File: 4chanthreadfinder.py

@@ -7,9 +7,11 @@ from bs4 import BeautifulSoup
 import mechanicalsoup
 import wget
 from os import mkdir, listdir
+from sys import path
 from re import findall
 from time import sleep
 from threading import Thread
+import logging
 
 '''
 ############## 4chan thread scrapper ################
@@ -38,7 +40,8 @@ def getArgs():
     parser = argparse.ArgumentParser()
     parser.add_argument("-u", "--url", help = "URL of the board. MUST INCLUDE FINAL /, ex : https://boards.4chan.org/b/", action="store", type=str, required=True)
     parser.add_argument("-f", "--folder", help = "Folder in which downloads will go, ex: ./downloads", action="store", type=str, required=True)
-    parser.add_argument("-k", "--keyword", help = "keyword or phrase to look for in the threads, ex : 'cute thread'", action="store", type=str, required=True)
+    parser.add_argument("-k", "--keyword", help = "keyword or phrase to look for in the threads, ex : 'cute thread'. Argument can be used multiple times", action='append', required=True)
+    parser.add_argument("-l", "--logfile", help="Name of the logfile. Please provide a name that is not already in use.", type=str, required=False)
     parser.add_argument("-c", "--constant", help = "Constantly download", action="store_true")
 
     #Creating the args object
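The switch from action="store" to action='append' is what enables repeated -k flags. A minimal standalone sketch of that behaviour (hypothetical keywords; the flag name is taken from the diff above):

import argparse

parser = argparse.ArgumentParser()
# Every occurrence of -k appends to one list instead of overwriting
parser.add_argument("-k", "--keyword", action='append', required=True)

args = parser.parse_args(["-k", "cute thread", "-k", "cat thread"])
print(args.keyword)  # ['cute thread', 'cat thread']

In other words, an invocation such as python3 4chanthreadfinder.py -u https://boards.4chan.org/b/ -f ./downloads -k 'cute thread' -k 'cat thread' now hands thread_finder a list of phrases rather than a single string.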
@@ -51,7 +54,7 @@ def thread_finder(homepage, keyword):
     returns a list of all the threads where op said keyword on 4chan homepage
     Args:
         - homepage: bs4 soup object containing html from the homepage of the board
-        - keyword : any single word
+        - keyword : list of expressions to look for
     Returns:
         - hrefs : all the references to matching threads. They must be appended to homepage to work
     '''
@@ -62,11 +65,13 @@ def thread_finder(homepage, keyword):
     for thread in threads:
         texts = thread.findAll('blockquote', {'class' : 'postMessage'})
         for text in texts:
-            if keyword.lower() in text.text.lower() and "loli" not in text.text.lower() and "shota" not in text.text.lower():
-                links = thread.findAll('a', {'title': 'Reply to this post'})
-                for link in links:
-                    hrefs.append(f"{link['href']}")
-    return hrefs
+            for word in keyword:
+                if word.lower() in text.text.lower() and "loli" not in text.text.lower() and "shota" not in text.text.lower():
+                    print(f"Found {word}")
+                    links = thread.findAll('a', {'title': 'Reply to this post'})
+                    for link in links:
+                        hrefs.append(f"{link['href']}")
+    return hrefs
@@ -83,7 +88,6 @@ def html_get(url):
     browser = mechanicalsoup.Browser()
     try:
         page = browser.get(url)
-        print(f"Got html from {url}")
         return page.soup
     except Exception as e:
         print(f"Got error {e}")
@@ -122,12 +126,6 @@ def item_dl(sources, dlfolder):
     global folder_content
 
-    try:
-        #Making folder
-        mkdir(dlfolder)
-    except FileExistsError:
-        print(f"{dlfolder} already exists, not creating")
-
     #Deduplicating
     imagenames = []
@@ -136,10 +134,25 @@ def item_dl(sources, dlfolder):
         imagename = findall(r"[^\/]*$", source)[0]
         if imagename[:-4] not in folder_content:
             name = wget.download(fullsource, out=dlfolder)
-            print(f"{name} downloaded")
+            print(f"{name} downloaded from {source}")
 
     return True
 
+def folder_create(dlfolder):
+    '''
+    Create the folder if it does not exist
+    Args:
+        - dlfolder : path of folder to create
+    '''
+    try:
+        #Making folder
+        mkdir(dlfolder)
+    except FileExistsError:
+        print(f"{dlfolder} already exists, not creating")
+
 def folder_watch(folder):
     '''
     Watch for the content of a folder and return its content.
@@ -160,12 +173,13 @@ def folder_watch(folder):
         folder_content += i
 
-def dl_threads(folder, url):
+def dl_threads(folder, url, log_enabled):
     '''
     Constantly download...
     Args:
         - folder: folder to dl into
         - url : board to watch
+        - log_enabled : Set True if logging lib is used
     '''
 
     try:
@@ -179,13 +193,21 @@ def dl_threads(folder, url):
             #oneshot
             for href in hrefs:
-                print(f"going after {url}{href}")
                 subsoup = html_get(f"{url}{href}")
                 subsources = scraper(subsoup)
                 folder_watch(folder)
-                item_dl(subsources, folder)
+                try:
+                    item_dl(subsources, folder)
+                    if log_enabled:
+                        logging.info(f"Downloaded {url}{href}")
+                    sleep(2)
+                except HTTPSConnectionPool as ConnErr:
+                    if log_enabled:
+                        logging.error(f"Got Error {ConErr}, pipes must be clogged lulz")
+                    else:
+                        print(f"Got Error {ConErr}, pipes must be clogged lulz")
 
     except Exception as e:
-        print(f"Houston, we had a problem: \n{e}")
+        print(f"Houston, we had a problem with {url} and {folder}: \n{e}")
@@ -193,12 +215,29 @@
 args = getArgs()
 folder = args.folder
 keyword = args.keyword
+
+if args.logfile:
+    logfile = args.logfile
+    #Creating Logfile
+    logging.basicConfig(\
+            format='%(asctime)s %(levelname)-8s %(message)s',\
+            filename=f"{path[0]}/{args.logfile}",\
+            level = logging.INFO,\
+            datefmt='%Y-%m-%d %H:%M:%S'\
+            )
+    log_enabled = True
+else:
+    log_enabled = False
+
 url = args.url
 soup = html_get(url)
 hrefs = thread_finder(soup, keyword)
 sources = scraper(soup)
+folder_create(folder)
 folder_content = ""
+print("Lurking...")
 
 #item_dl(sources, folder)
 #Dling all threads found
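For reference, sys.path[0] (imported above as path) is the directory containing the invoked script, so the logfile lands next to 4chanthreadfinder.py regardless of the working directory. The same pattern in isolation, with a hypothetical logfile name:

import logging
from sys import path

# Log lines come out like: 2021-02-19 11:00:15 INFO     Lurking...
logging.basicConfig(
    format='%(asctime)s %(levelname)-8s %(message)s',
    filename=f"{path[0]}/scraper.log",  # scraper.log is a placeholder name
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S',
)
logging.info("Lurking...")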
@@ -207,10 +246,10 @@ folder_content = ""
 if not args.constant:
     for href in hrefs:
         folder_watch(folder)
-        dl_threads(folder_url)
+        dl_threads(folder, url, log_enabled)
 else:
     while True:
         folder_watch(folder)
         dl_threads(folder, url)
+        sleep(60)
+
+print('Sayonara')

View File: scrape.py

@@ -6,10 +6,10 @@ from bs4 import BeautifulSoup
 #Connexion web
 import mechanicalsoup
 import wget
-from os import mkdir
+from os import mkdir, listdir
 
 '''
 ############## 4chan thread scrapper ################
-Give me a page, i'll download all of its pictures !
+Give me a 4chan thread, i'll download all of its pictures !
 '''
@@ -57,10 +57,10 @@ def scraper(soup):
     Returns:
         - sources : A list of image sources
     '''
-    tags=soup.findAll('img')
+    tags=soup.findAll('a', {"class": "fileThumb"})
     sources = []
     for tag in tags:
-        sources.append(tag['src'])
+        sources.append(tag['href'])
 
     #Got image sources, removing any left http headers
     for index, item in enumerate(sources):
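Context for the selector change, assuming 4chan's usual thread markup: each image post wraps a thumbnail img inside an a.fileThumb anchor whose href points at the full-size file, so reading the anchor's href instead of the img's src upgrades downloads from thumbnails to originals. A sketch against a hypothetical post fragment:

from bs4 import BeautifulSoup

# Hypothetical 4chan post fragment, for illustration only
html = '<a class="fileThumb" href="//i.4cdn.org/b/1612345.jpg"><img src="//i.4cdn.org/b/1612345s.jpg"></a>'
soup = BeautifulSoup(html, "html.parser")

print(soup.find('img')['src'])                         # //i.4cdn.org/b/1612345s.jpg  (thumbnail)
print(soup.find('a', {"class": "fileThumb"})['href'])  # //i.4cdn.org/b/1612345.jpg   (full size)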
@@ -84,8 +84,32 @@ def item_dl(sources, dlfolder):
     for source in sources:
         fullsource = "http://" + source
-        name = wget.download(fullsource, out=dlfolder)
-        print(f"{name} downloaded")
+        if dlfolder[-1] == "/":
+            dlfolder = dlfolder[:-1]
+        if not deduplicate(fullsource, dlfolder.split("/")[-1]):
+            name = wget.download(fullsource, out=dlfolder)
+            print(f"{name} downloaded")
+        else:
+            print(f"{source} is already there")
+
+def deduplicate(url, folder):
+    '''
+    Takes a url to an image and a folder, check if said image exists in said folder
+    Args:
+        - url : a str containing a full url to an image
+        - folder : Name of a folder in /Images
+    Returns:
+        - True : The image is already in the folder
+        - False : The image is not in the folder
+    '''
+    image_name = url.split("/")[-1]
+    image_name = image_name.split("?")[0]
+    files = listdir(f"/Images/{folder}")
+    for i in files:
+        if i == image_name:
+            return True
+    return False
+
 args = getArgs()
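A usage sketch for the new deduplicate helper. Note that the /Images root is hard-coded, so the check assumes downloads live under /Images/<board>; the URL and folder below are hypothetical:

# Has 1612345.jpg already been saved to /Images/b ?  (illustration only)
already_there = deduplicate("http://i.4cdn.org/b/1612345.jpg?s=rand", "b")

# Inside the helper: image_name becomes "1612345.jpg" (query string stripped),
# then it is compared against every entry of listdir("/Images/b").
if not already_there:
    print("safe to download")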