diff --git a/scrape.py b/scrape.py
index 5e791f5..3e1f033 100755
--- a/scrape.py
+++ b/scrape.py
@@ -6,10 +6,10 @@ from bs4 import BeautifulSoup
 #Connexion web
 import mechanicalsoup
 import wget
-from os import mkdir
+from os import mkdir, listdir
 '''
 ############## 4chan thread scrapper ################
-Give me a page, i'll download all of its pictures !
+Give me a 4chan thread, i'll download all of its pictures !
 '''
 
 
@@ -57,10 +57,10 @@ def scraper(soup):
     Returns:
     - sources : A list of image sources
     '''
-    tags=soup.findAll('img')
+    tags=soup.findAll('a', {"class": "fileThumb"})
     sources = []
     for tag in tags:
-        sources.append(tag['src'])
+        sources.append(tag['href'])
 
     #Got image sources, removing any left http headers
     for index, item in enumerate(sources):
@@ -84,8 +84,32 @@ def item_dl(sources, dlfolder):
 
     for source in sources:
         fullsource = "http://" + source
-        name = wget.download(fullsource, out=dlfolder)
-        print(f"{name} downloaded")
+        #Only download images that are not already in the thread folder
+        if not deduplicate(fullsource, dlfolder.split("/")[-1]):
+            name = wget.download(fullsource, out=dlfolder)
+            print(f"{name} downloaded")
+        else:
+            print(f"{source} is already there")
+
+def deduplicate(url, folder):
+    '''
+    Takes a url to an image and a folder, check if said image exists in said folder
+    Args:
+    - url : a str containing a full url to an image
+    - folder : Name of a folder in /Images
+    Returns:
+    - True : The image is already in the folder
+    - False : The image is not in the folder, or the folder does not exist
+    '''
+    #File name = last path segment of the url, without any query string
+    image_name = url.split("/")[-1].split("?")[0]
+    try:
+        files = listdir(f"/Images/{folder}")
+    except FileNotFoundError:
+        #A missing folder cannot contain the image
+        return False
+
+    return image_name in files
 
 
 args = getArgs()