ImageScrapper/scrape.py

#!/usr/bin/env python3
# coding: utf8
'''
############## 4chan thread scraper ################
Give me a 4chan thread, I'll download all of its pictures!
'''
import argparse
from os import mkdir, listdir

# Scraper
from bs4 import BeautifulSoup
# Web connection
import mechanicalsoup
import wget
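
# Example invocation (hypothetical thread URL, assuming the script is executable):
#   ./scrape.py -u https://boards.4chan.org/b/thread/123456789 -f ./downloads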

def getArgs():
    '''Gets all the arguments passed to the script and returns them in a parse_args()-type object.
    No args
    Returns:
        - args : an args object containing all the arguments passed to the script.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument("-u", "--url", help="URL of the thread", action="store", type=str, required=True)
    parser.add_argument("-f", "--folder", help="Folder in which downloads will go, ex: ./downloads", action="store", type=str, required=True)
    # Creating the args object
    args = parser.parse_args()
    return args

def html_get(url):
    '''
    Get html from the webpage
    Args:
        - url : a str containing the url to scrape
    Returns:
        - page.soup : a BeautifulSoup object containing the html
    '''
    # Browser
    browser = mechanicalsoup.Browser()
    try:
        page = browser.get(url)
        print(f"Got html from {url}")
        return page.soup
    except Exception as e:
        print(f"Got error {e}")
        return None

def scraper(soup):
    '''
    Scrape a bs4 html object, find posts w/ images and get the full size source
    Args:
        - soup : a bs4 soup object
    Returns:
        - sources : a list of image sources
    '''
    tags = soup.find_all('a', {"class": "fileThumb"})
    sources = []
    for tag in tags:
        sources.append(tag['href'])
    # Got image sources, stripping any leftover protocol prefix
    for index, item in enumerate(sources):
        # Chain the replaces so each one applies to the already-stripped string
        sources[index] = item.replace("http://", "").replace("https://", "").replace("//", "")
    print(f"{len(sources)} images found")
    return sources
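
# For illustration: a protocol-relative href such as "//i.4cdn.org/b/123.jpg"
# (hypothetical file name) comes out of scraper() as "i.4cdn.org/b/123.jpg",
# ready for item_dl() to prepend "http://".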

def item_dl(sources, dlfolder):
    '''
    Download all items in the sources list to folder dlfolder, which we try to create
    Args:
        - sources : a list of URLs
        - dlfolder : path of the download folder
    '''
    try:
        mkdir(dlfolder)
    except FileExistsError:
        print(f"{dlfolder} already exists, not creating")
    # Strip a trailing slash so path handling stays consistent
    if dlfolder[-1] == "/":
        dlfolder = dlfolder[:-1]
    for source in sources:
        fullsource = "http://" + source
        if not deduplicate(fullsource, dlfolder):
            name = wget.download(fullsource, out=dlfolder)
            print(f"{name} downloaded")
        else:
            print(f"{source} is already there")
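
# A minimal call, with a hypothetical source list:
#   item_dl(["i.4cdn.org/b/123.jpg"], "./downloads")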

def deduplicate(url, folder):
    '''
    Takes a url to an image and a folder, checks if said image already exists in said folder
    Args:
        - url : a str containing a full url to an image
        - folder : path of the download folder
    Returns:
        - True : the image is already in the folder
        - False : the image is not in the folder
    '''
    # Keep only the file name, dropping any query string
    image_name = url.split("/")[-1]
    image_name = image_name.split("?")[0]
    files = listdir(folder)
    return image_name in files
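
# For example (hypothetical file name): if "123.jpg" is already in ./downloads,
# deduplicate("http://i.4cdn.org/b/123.jpg?s=1", "./downloads") returns True.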

if __name__ == "__main__":
    args = getArgs()
    folder = args.folder
    url = args.url
    soup = html_get(url)
    if soup is not None:
        sources = scraper(soup)
        item_dl(sources, folder)