Update 'scrape.py'

Added dedup: skip downloading images that already exist in the target folder
2021-08-12 00:26:36 +00:00 · 2021-08-12 00:26:36 +00:00 · b2e7deb058
commit b2e7deb058
parent e06953ccf6
1 changed files with 28 additions and 6 deletions
--- a/scrape.py
+++ b/scrape.py
@ -6,10 +6,10 @@ from bs4 import BeautifulSoup
 #Connexion web
 import mechanicalsoup
 import wget
-from os import mkdir
+from os import mkdir, listdir
 '''
 ############## 4chan thread scrapper ################
-Give me a page, i'll download all of its pictures !
+Give me a 4chan thread, i'll download all of its pictures !
 '''


@ -57,10 +57,10 @@ def scraper(soup):
    Returns:
    - sources : A list of image sources
    '''
-    tags=soup.findAll('img')
+    tags=soup.findAll('a', {"class": "fileThumb"})
    sources = []
    for tag in tags:
-        sources.append(tag['src'])
+        sources.append(tag['href'])

    #Got image sources, removing any left http headers
    for index, item in enumerate(sources):
@ -84,8 +84,30 @@ def item_dl(sources, dlfolder):

    for source in sources:
        fullsource = "http://" + source
-        name = wget.download(fullsource, out=dlfolder)
-        print(f"{name} downloaded")
+        if not deduplicate(fullsource, dlfolder.split("/")[-1]):
+            name = wget.download(fullsource, out=dlfolder)
+            print(f"{name} downloaded")
+        else:
+            print(f"{source} is already there")
+
+def deduplicate(url, folder):
+    '''
+    Check whether the image a url points to already exists in a folder under /Images.
+    Args:
+    - url : a str containing a full url to an image; only its last path segment is used
+    - folder : name of a folder, listed as /Images/<folder>
+    Returns:
+    - True : a file whose name exactly matches the image name is in the folder
+    - False : no file with that name is in the folder
+    '''
+    image_name = url.split("/")[-1]  # last path segment of the url, i.e. the file name
+    image_name = image_name.split("?")[0]  # drop any query string after the file name
+    files = listdir(f"/Images/{folder}")  # NOTE(review): absolute /Images root; confirm it matches the caller's dlfolder
+
+    for i in files:
+        if i == image_name:
+            return True
+    return False


 args = getArgs()