Update 'scrape.py'
Added deduplication: images already present in the download folder are skipped
This commit is contained in:
parent
e06953ccf6
commit
b2e7deb058
30
scrape.py
30
scrape.py
@ -6,10 +6,10 @@ from bs4 import BeautifulSoup
|
|||||||
#Connexion web
|
#Connexion web
|
||||||
import mechanicalsoup
|
import mechanicalsoup
|
||||||
import wget
|
import wget
|
||||||
from os import mkdir
|
from os import mkdir, listdir
|
||||||
'''
|
'''
|
||||||
############## 4chan thread scrapper ################
|
############## 4chan thread scrapper ################
|
||||||
Give me a page, i'll download all of its pictures !
|
Give me a 4chan thread, I'll download all of its pictures!
|
||||||
'''
|
'''
|
||||||
|
|
||||||
|
|
||||||
@ -57,10 +57,10 @@ def scraper(soup):
|
|||||||
Returns:
|
Returns:
|
||||||
- sources : A list of image sources
|
- sources : A list of image sources
|
||||||
'''
|
'''
|
||||||
tags=soup.findAll('img')
|
tags=soup.findAll('a', {"class": "fileThumb"})
|
||||||
sources = []
|
sources = []
|
||||||
for tag in tags:
|
for tag in tags:
|
||||||
sources.append(tag['src'])
|
sources.append(tag['href'])
|
||||||
|
|
||||||
#Got image sources, removing any left http headers
|
#Got image sources, removing any left http headers
|
||||||
for index, item in enumerate(sources):
|
for index, item in enumerate(sources):
|
||||||
@ -84,8 +84,30 @@ def item_dl(sources, dlfolder):
|
|||||||
|
|
||||||
for source in sources:
|
for source in sources:
|
||||||
fullsource = "http://" + source
|
fullsource = "http://" + source
|
||||||
|
if not deduplicate(fullsource, dlfolder.split("/")[-1]):
|
||||||
name = wget.download(fullsource, out=dlfolder)
|
name = wget.download(fullsource, out=dlfolder)
|
||||||
print(f"{name} downloaded")
|
print(f"{name} downloaded")
|
||||||
|
else:
|
||||||
|
print(f"{source} is already there")
|
||||||
|
|
||||||
|
def deduplicate(url, folder):
    '''
    Takes a url to an image and a folder, checks whether said image already exists in said folder
    Args:
        - url : a str containing a full url to an image
        - folder : Name of a folder in /Images
    Returns:
        - True : The image is already in the folder
        - False : The image is not in the folder, or the folder does not exist yet
    '''
    # Keep only the last path segment of the url and drop any query string,
    # so the result can be compared against the file names on disk.
    image_name = url.split("/")[-1].split("?")[0]

    # NOTE(review): this path is absolute ("/Images/..."); confirm it matches
    # the folder the images are actually downloaded into.
    try:
        files = listdir(f"/Images/{folder}")
    except FileNotFoundError:
        # No such folder yet, so nothing can have been downloaded into it.
        return False

    return image_name in files
|
||||||
|
|
||||||
|
|
||||||
args = getArgs()
|
args = getArgs()
|
||||||
|
Reference in New Issue
Block a user