Update 'scrape.py'

Added deduplication: images that are already present in the download folder are skipped
This commit is contained in:
justine 2021-08-12 00:26:36 +00:00
parent e06953ccf6
commit b2e7deb058

View File

@ -6,10 +6,10 @@ from bs4 import BeautifulSoup
#Web connection
import mechanicalsoup
import wget
from os import mkdir
from os import mkdir, listdir
'''
############## 4chan thread scraper ################
Give me a page, i'll download all of its pictures !
Give me a 4chan thread, I'll download all of its pictures!
'''
@ -57,10 +57,10 @@ def scraper(soup):
Returns:
- sources : A list of image sources
'''
tags=soup.findAll('img')
tags=soup.findAll('a', {"class": "fileThumb"})
sources = []
for tag in tags:
sources.append(tag['src'])
sources.append(tag['href'])
#Got image sources, removing any left http headers
for index, item in enumerate(sources):
@ -84,8 +84,30 @@ def item_dl(sources, dlfolder):
for source in sources:
fullsource = "http://" + source
name = wget.download(fullsource, out=dlfolder)
print(f"{name} downloaded")
if not deduplicate(fullsource, dlfolder.split("/")[-1]):
name = wget.download(fullsource, out=dlfolder)
print(f"{name} downloaded")
else:
print(f"{source} is already there")
def deduplicate(url, folder):
    '''
    Takes a url to an image and a folder, checks if said image exists in said folder
    Args:
    - url : a str containing a full url to an image
    - folder : Name of a folder in /Images
    Returns:
    - True : The image is already in the folder
    - False : The image is not in the folder, or the folder does not exist
    '''
    # The file name is the last path segment of the url, without any query string
    image_name = url.split("/")[-1].split("?")[0]
    try:
        files = listdir(f"/Images/{folder}")
    except FileNotFoundError:
        # A folder that does not exist cannot contain the image: report
        # "not a duplicate" instead of crashing the caller
        return False
    return image_name in files
args = getArgs()