#!/usr/bin/env python3
# coding: utf8

# CLI arguments
import argparse

# Scraping
from bs4 import BeautifulSoup

# Web connection
import mechanicalsoup
import wget

'''
############## image scraper ################

Give me a URL, I'll download all of its pictures!
'''

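# Example invocation (the filename is hypothetical; both flags are required
# by getArgs below):
#   ./scraper.py --url "https://example.com/thread/123" --folder ./downloads
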
def getArgs():
    '''Get all the arguments passed to the script and return them in a parse_args()-type object.

    No args.

    Returns:
        - args : an args object containing all the arguments passed to the script.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument("-u", "--url", help="URL of the thread", action="store", type=str, required=True)
    parser.add_argument("-f", "--folder", help="Folder in which downloads will go", action="store", type=str, required=True)

    # Creating the args object
    args = parser.parse_args()

    return args

def html_get(url):
    '''
    Get html from the webpage.

    Args:
        - url : a str containing the URL to scrape

    Returns:
        - page.soup : a BeautifulSoup object containing the html
    '''
    # Browser
    browser = mechanicalsoup.Browser()
    try:
        page = browser.get(url)
        print(f"Got html from {url}")
        return page.soup
    except Exception as e:
        # Report the failure, then re-raise so callers don't get a bare None
        print(f"Got error {e}")
        raise

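# Illustrative use of html_get (example.com is a stand-in URL):
#   soup = html_get("https://example.com")
#   print(soup.title)  # e.g. <title>Example Domain</title>
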
def scraper(soup):
    '''
    Scrape a bs4 html object, find posts w/ images and get the full size source.

    Args:
        - soup : bs4 soup object

    Returns:
        - sources : a list of image sources
    '''
    tags = soup.find_all('img')
    sources = []
    for tag in tags:
        # img tags hold their source in the src attribute, not href
        if tag.get('src'):
            sources.append(tag['src'])

    # Got image sources, removing any leftover scheme prefix
    for index, item in enumerate(sources):
        sources[index] = item.replace("http://", "").replace("https://", "").replace("//", "")

    print(f"{len(sources)} images found")
    return sources

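# Illustrative result of scraper (hypothetical markup): for soup built from
#   '<img src="https://i.example.com/a.jpg"><img src="//i.example.com/b.png">'
# scraper(soup) returns ['i.example.com/a.jpg', 'i.example.com/b.png'].
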
def item_dl(sources, dlfolder):
    '''
    Download all items in the sources list to the folder dlfolder.

    Args:
        - sources : a list of URLs
        - dlfolder : a str path of the destination folder
    '''
    for source in sources:
        fullsource = "http://" + source
        name = wget.download(fullsource, out=dlfolder)
        print(f"{name} downloaded")

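# Illustrative use of item_dl (hypothetical host and folder):
#   item_dl(['i.example.com/a.jpg'], './downloads')
# re-adds the http:// scheme, and wget.download saves the file into
# ./downloads, returning the local filename.
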
if __name__ == "__main__":
    args = getArgs()
    folder = args.folder
    url = args.url
    #url = "https://boards.4chan.org/b/"
    #url = 'https://www.deviantart.com/'

    soup = html_get(url)
    sources = scraper(soup)
    item_dl(sources, folder)