Compare commits

...

7 Commits

Author SHA1 Message Date
ca78d58458 Update 'scrape.py'
Fixed dedupe
2021-08-12 00:54:29 +00:00
b2e7deb058 Update 'scrape.py'
Added dedup
2021-08-12 00:26:36 +00:00
e06953ccf6 Update '4chanthreadfinder.py'
Change loglevel to INFO so as not to be flooded by mechanicalSoup
2021-02-19 11:00:15 +00:00
77c20d67f1 Update '4chanthreadfinder.py'
Added logging
2021-02-19 10:52:22 +00:00
52fdd4f4b1 Update '4chanthreadfinder.py'
Added multiple keywords support
2021-02-10 14:26:26 +00:00
a97067d452 Update '4chanthreadfinder.py'
Bugfixes
2021-02-09 20:19:48 +00:00
aa67b222d8 Cleanup 2021-02-09 21:49:12 +01:00
14 changed files with 91 additions and 28 deletions

12 binary files (images) removed, not shown. Former sizes: 88 KiB, 248 KiB, 272 KiB, 408 KiB, 344 KiB, 376 KiB, 408 KiB, 920 KiB, 24 KiB, 24 KiB, 24 KiB, 56 KiB.

4chanthreadfinder.py

@@ -7,9 +7,11 @@ from bs4 import BeautifulSoup
 import mechanicalsoup
 import wget
 from os import mkdir, listdir
+from sys import path
 from re import findall
 from time import sleep
 from threading import Thread
+import logging
 '''
 ############## 4chan thread scrapper ################
@@ -38,7 +40,8 @@ def getArgs():
     parser = argparse.ArgumentParser()
     parser.add_argument("-u", "--url", help = "URL of the board. MUST INCLUDE FINAL /, ex : https://boards.4chan.org/b/", action="store", type=str, required=True)
     parser.add_argument("-f", "--folder", help = "Folder in which downloads will go, ex: ./downloads", action="store", type=str, required=True)
-    parser.add_argument("-k", "--keyword", help = "keyword or phrase to look for in the threads, ex : 'cute thread'", action="store", type=str, required=True)
+    parser.add_argument("-k", "--keyword", help = "keyword or phrase to look for in the threads, ex : 'cute thread'. Argument can be used multiple times", action='append', required=True)
+    parser.add_argument("-l", "--logfile", help="Name of the logfile. Please provide a name that is not already in use.", type=str, required=False)
     parser.add_argument("-c", "--constant", help = "Constantly download", action="store_true")
     #Creating the args object
@@ -51,7 +54,7 @@ def thread_finder(homepage, keyword):
     returns a list of all the threads where op said keyword on 4chan homepage
     Args:
         - homepage: bs4 soup object containing html from the homepage of the board
-        - keyword : any single word
+        - keyword : list of expressions to look for
     Returns:
         - hrefs : all the references to matching threads. They must be appended to homepage to work
     '''
@@ -62,11 +65,13 @@ def thread_finder(homepage, keyword):
     for thread in threads:
         texts = thread.findAll('blockquote', {'class' : 'postMessage'})
         for text in texts:
-            if keyword.lower() in text.text.lower() and "loli" not in text.text.lower() and "shota" not in text.text.lower():
-                links = thread.findAll('a', {'title': 'Reply to this post'})
-                for link in links:
-                    hrefs.append(f"{link['href']}")
-    return hrefs
+            for word in keyword:
+                if word.lower() in text.text.lower() and "loli" not in text.text.lower() and "shota" not in text.text.lower():
+                    print(f"Found {word}")
+                    links = thread.findAll('a', {'title': 'Reply to this post'})
+                    for link in links:
+                        hrefs.append(f"{link['href']}")
+    return hrefs
@@ -83,7 +88,6 @@ def html_get(url):
     browser = mechanicalsoup.Browser()
     try:
         page = browser.get(url)
-        print(f"Got html from {url}")
         return page.soup
     except Exception as e:
         print(f"Got error {e}")
@@ -122,12 +126,6 @@ def item_dl(sources, dlfolder):
     global folder_content
-    try:
-        #Making folder
-        mkdir(dlfolder)
-    except FileExistsError:
-        print(f"{dlfolder} already exists, not creating")
     #Deduplicating
     imagenames = []
@@ -136,10 +134,25 @@ def item_dl(sources, dlfolder):
         imagename = findall(r"[^\/]*$", source)[0]
         if imagename[:-4] not in folder_content:
             name = wget.download(fullsource, out=dlfolder)
-            print(f"{name} downloaded")
+            print(f"{name} downloaded from {source}")
     return True

+def folder_create(dlfolder):
+    '''
+    Create the folder if it does not exist
+    Args:
+        - dlfolder : path of folder to create
+    '''
+    try:
+        #Making folder
+        mkdir(dlfolder)
+    except FileExistsError:
+        print(f"{dlfolder} already exists, not creating")

 def folder_watch(folder):
     '''
     Watch for the content of a folder and return its content.
@@ -160,12 +173,13 @@ def folder_watch(folder):
         folder_content += i

-def dl_threads(folder, url):
+def dl_threads(folder, url, log_enabled):
     '''
     Constantly download...
     Args:
         - folder: folder to dl into
        - url : board to watch
+        - log_enabled : Set True if logging lib is used
     '''
     try:
@@ -179,13 +193,21 @@ def dl_threads(folder, url):
         #oneshot
         for href in hrefs:
             print(f"going after {url}{href}")
             subsoup = html_get(f"{url}{href}")
             subsources = scraper(subsoup)
             folder_watch(folder)
-            item_dl(subsources, folder)
+            try:
+                item_dl(subsources, folder)
+                if log_enabled:
+                    logging.info(f"Downloaded {url}{href}")
+                sleep(2)
+            except ConnectionError as conn_err:
+                if log_enabled:
+                    logging.error(f"Got Error {conn_err}, pipes must be clogged lulz")
+                else:
+                    print(f"Got Error {conn_err}, pipes must be clogged lulz")
     except Exception as e:
-        print(f"Houston, we had a problem: \n{e}")
+        print(f"Houston, we had a problem with {url} and {folder}: \n{e}")
@@ -193,12 +215,29 @@ def dl_threads(folder, url):
 args = getArgs()
 folder = args.folder
 keyword = args.keyword
+if args.logfile:
+    logfile = args.logfile
+    #Creating Logfile
+    logging.basicConfig(
+        format='%(asctime)s %(levelname)-8s %(message)s',
+        filename=f"{path[0]}/{args.logfile}",
+        level = logging.INFO,
+        datefmt='%Y-%m-%d %H:%M:%S'
+    )
+    log_enabled = True
+else:
+    log_enabled = False
 url = args.url
 soup = html_get(url)
 hrefs = thread_finder(soup, keyword)
 sources = scraper(soup)
+folder_create(folder)
 folder_content = ""
 print("Lurking...")
 #item_dl(sources, folder)

 #Dling all threads found
@@ -207,10 +246,10 @@ folder_content = ""
 if not args.constant:
     for href in hrefs:
         folder_watch(folder)
-        dl_threads(folder_url)
+        dl_threads(folder, url, log_enabled)
 else:
     while True:
         folder_watch(folder)
         dl_threads(folder, url, log_enabled)
         sleep(60)
 print('Sayonara')
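
For readers skimming the diff, the two behavioural changes in this file are argparse's `action='append'` (each repeated `-k` flag accumulates into one list) and the optional `--logfile` gate around `logging.basicConfig`. Below is a minimal standalone sketch of both patterns, with made-up argument values and a made-up log filename; it mirrors the diff but is not the project's code:

```python
# Sketch only: demonstrates action='append' and an optional logfile gate.
import argparse
import logging

parser = argparse.ArgumentParser()
# action='append' collects every -k occurrence into a list,
# e.g. -k 'cute thread' -k cats  ->  ['cute thread', 'cats']
parser.add_argument("-k", "--keyword", action="append", required=True)
parser.add_argument("-l", "--logfile", type=str, required=False)
# Hypothetical values standing in for real CLI input:
args = parser.parse_args(["-k", "cute thread", "-k", "cats", "-l", "run.log"])

if args.logfile:
    # One-time root-logger setup; later logging.info()/logging.error()
    # calls anywhere in the program write to this file.
    logging.basicConfig(
        format="%(asctime)s %(levelname)-8s %(message)s",
        filename=args.logfile,
        level=logging.INFO,
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    log_enabled = True
else:
    log_enabled = False

# The matching loop then tests every keyword against a post's text.
text = "A very cute thread full of cats"
for word in args.keyword:
    if word.lower() in text.lower():
        print(f"Found {word}")
```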

scrape.py

@@ -6,10 +6,10 @@ from bs4 import BeautifulSoup
 #Web connection
 import mechanicalsoup
 import wget
-from os import mkdir
+from os import mkdir, listdir
 '''
 ############## 4chan thread scrapper ################
-Give me a page, i'll download all of its pictures !
+Give me a 4chan thread, i'll download all of its pictures !
 '''
@@ -57,10 +57,10 @@ def scraper(soup):
     Returns:
         - sources : A list of image sources
     '''
-    tags=soup.findAll('img')
+    tags=soup.findAll('a', {"class": "fileThumb"})
     sources = []
     for tag in tags:
-        sources.append(tag['src'])
+        sources.append(tag['href'])
     #Got image sources, removing any left http headers
     for index, item in enumerate(sources):
@@ -84,8 +84,32 @@ def item_dl(sources, dlfolder):
     for source in sources:
         fullsource = "http://" + source
-        name = wget.download(fullsource, out=dlfolder)
-        print(f"{name} downloaded")
+        if dlfolder[-1] == "/":
+            dlfolder = dlfolder[:-1]
+        if not deduplicate(fullsource, dlfolder.split("/")[-1]):
+            name = wget.download(fullsource, out=dlfolder)
+            print(f"{name} downloaded")
+        else:
+            print(f"{source} is already there")

+def deduplicate(url, folder):
+    '''
+    Takes a url to an image and a folder, check if said image exists in said folder
+    Args:
+        - url : a str containing a full url to an image
+        - folder : Name of a folder in /Images
+    Returns:
+        - True : The image is already in the folder
+        - False : The image is not in the folder
+    '''
+    image_name = url.split("/")[-1]
+    image_name = image_name.split("?")[0]
+    files = listdir(f"/Images/{folder}")
+    for i in files:
+        if i == image_name:
+            return True
+    return False

 args = getArgs()
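
The heart of the new `deduplicate` helper is the URL-to-filename normalization: keep everything after the last `/`, then drop any `?query` suffix, and compare against what is already on disk under the hard-coded `/Images` root. A filesystem-free sketch of the same idea, using an in-memory set and an invented example URL in place of `listdir(f"/Images/{folder}")`:

```python
# Sketch of the dedup check above, with an in-memory set standing in
# for the directory listing; not the repo's code.
def already_downloaded(url: str, existing: set) -> bool:
    # Mirrors url.split("/")[-1] then split("?")[0] from the diff:
    # isolate the filename and strip any query string.
    image_name = url.split("/")[-1].split("?")[0]
    return image_name in existing

# Hypothetical filenames and URLs for illustration:
existing = {"1612885563921.jpg"}
print(already_downloaded("http://i.example.org/b/1612885563921.jpg?x=1", existing))  # True
print(already_downloaded("http://i.example.org/b/999.png", existing))                # False
```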