Update 'scrape.py'
Added dedup
This commit is contained in:
parent
e06953ccf6
commit
b2e7deb058
34
scrape.py
34
scrape.py
@ -6,10 +6,10 @@ from bs4 import BeautifulSoup
|
||||
#Connexion web
|
||||
import mechanicalsoup
|
||||
import wget
|
||||
from os import mkdir
|
||||
from os import mkdir, listdir
|
||||
'''
|
||||
############## 4chan thread scraper ################
|
||||
Give me a page, i'll download all of its pictures !
|
||||
Give me a 4chan thread, i'll download all of its pictures !
|
||||
'''
|
||||
|
||||
|
||||
@ -57,10 +57,10 @@ def scraper(soup):
|
||||
Returns:
|
||||
- sources : A list of image sources
|
||||
'''
|
||||
tags=soup.findAll('img')
|
||||
tags=soup.findAll('a', {"class": "fileThumb"})
|
||||
sources = []
|
||||
for tag in tags:
|
||||
sources.append(tag['src'])
|
||||
sources.append(tag['href'])
|
||||
|
||||
#Got image sources, removing any left http headers
|
||||
for index, item in enumerate(sources):
|
||||
@ -84,8 +84,30 @@ def item_dl(sources, dlfolder):
|
||||
|
||||
for source in sources:
|
||||
fullsource = "http://" + source
|
||||
name = wget.download(fullsource, out=dlfolder)
|
||||
print(f"{name} downloaded")
|
||||
if not deduplicate(fullsource, dlfolder.split("/")[-1]):
|
||||
name = wget.download(fullsource, out=dlfolder)
|
||||
print(f"{name} downloaded")
|
||||
else:
|
||||
print(f"{source} is already there")
|
||||
|
||||
def deduplicate(url, folder, root="/Images"):
    '''
    Takes a url to an image and a folder, checks if said image already exists in said folder

    Only the file name is compared: the last path segment of the url, with any
    "?query" suffix removed. File contents are never inspected.

    Args:
        - url : a str containing a full url to an image
        - folder : Name of a folder inside root
        - root : Directory holding the download folders (defaults to /Images)
    Returns:
        - True : The image is already in the folder
        - False : The image is not in the folder, or the folder does not exist
    '''
    # Last path segment of the url, without a trailing query string.
    image_name = url.split("/")[-1].split("?")[0]

    try:
        files = listdir(f"{root}/{folder}")
    except FileNotFoundError:
        # A folder that has not been created yet cannot contain the image.
        return False

    return image_name in files
|
||||
|
||||
|
||||
args = getArgs()
|
||||
|
Reference in New Issue
Block a user