4chanscrape.py (new executable file)
@@ -0,0 +1,93 @@
#!/usr/bin/env python3
# coding: utf8

# Scraper
from bs4 import BeautifulSoup
# Web connection
import mechanicalsoup
import wget

'''
############## 4chan thread scraper ################
Give me a thread, I'll download all of its pictures!
'''

def getArgs():
    '''Get all the arguments passed to the script and return them in a parse_args()-type object.

    No args.

    Returns:
        - args: an args object containing all the arguments passed to the script.
    '''
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("-u", "--url", help="URL of the thread", action="store", type=str, required=True)
    parser.add_argument("-f", "--folder", help="Folder in which downloads will go", action="store", type=str, required=True)
    # Create the args object
    args = parser.parse_args()

    return args

def html_get(url):
    '''
    Get the HTML from a webpage.

    Args:
        - url: a str containing the URL to scrape

    Returns:
        - page.soup: a BeautifulSoup object containing the page's HTML
    '''
    # Browser
    browser = mechanicalsoup.Browser()
    try:
        page = browser.get(url)
        print(f"Got html from {url}")
        return page.soup
    except Exception as e:
        # Report the error, then re-raise: returning None here would only
        # crash later in scraper() with a confusing AttributeError
        print(f"Got error {e}")
        raise

def scraper(soup):
    '''
    Scrape a bs4 HTML object, find posts with images and get each full-size source.

    Args:
        - soup: a bs4 soup object

    Returns:
        - sources: a list of image sources
    '''
    tags = soup.find_all('a', {"class": "fileThumb"})
    sources = []
    for tag in tags:
        sources.append(tag['href'])

    # Got image sources; strip any leading scheme. The replace() calls must be
    # chained: three separate `item.replace(...)` assignments would each start
    # over from the original string, so only the last one would take effect.
    for index, item in enumerate(sources):
        sources[index] = item.replace("http://", "").replace("https://", "").replace("//", "", 1)
    print(f"{len(sources)} images found")
    return sources

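# A minimal sketch of what scraper() extracts, assuming 4chan's usual markup
# where each thumbnail anchor carries the "fileThumb" class and a
# protocol-relative href to the full-size file (hypothetical example tag):
#
#   <a class="fileThumb" href="//i.4cdn.org/c/1612495733903.jpg">...</a>
#
# scraper() collects "//i.4cdn.org/c/1612495733903.jpg", and the clean-up loop
# strips the leading "//", leaving "i.4cdn.org/c/1612495733903.jpg".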

def item_dl(sources, dlfolder):
    '''
    Download every item in the sources list to the folder dlfolder.

    Args:
        - sources : a list of URLs
        - dlfolder : a str, the destination folder
    '''
    for source in sources:
        fullsource = "http://" + source
        # wget.download fetches fullsource into dlfolder and returns the path it wrote
        name = wget.download(fullsource, out=dlfolder)
        print(f"{name} downloaded")

if __name__ == "__main__":
    args = getArgs()
    folder = args.folder
    url = args.url
    #url = "https://boards.4chan.org/b/"
    #url = 'https://www.deviantart.com/'
    soup = html_get(url)
    sources = scraper(soup)
    item_dl(sources, folder)
README.md (new file)
@@ -0,0 +1,20 @@
# Scrapers

Two scrapers:
* The 4chan one downloads all of a thread's images at full resolution
* The other one simply looks for "img" tags in any given page and downloads the images (see the example at the end of this README)

Install dependencies:

```
python3 -m pip install beautifulsoup4 mechanicalsoup wget --user
```

Usage:

```
./4chanscrape.py -u https://boards.4channel.org/c/thread/3846676/gunsmith-cats-thread -f ./downloads
```

* -u : URL of the page
* -f : folder where you want to download all pictures
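The generic scraper takes the same flags. A hypothetical invocation (any page whose images use absolute `src` URLs should work):

```
./scrape.py -u https://www.deviantart.com/ -f ./downloads
```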
BIN downloads/1612495733903.jpg (new file, 228 KiB)
BIN downloads/1612495889126.jpg (new file, 19 KiB)
BIN downloads/1612496273920.jpg (new file, 772 KiB)
BIN downloads/1612496326357.png (new file, 930 KiB)
BIN downloads/1612496427354.png (new file, 984 KiB)
BIN downloads/1612496458053.jpg (new file, 60 KiB)
BIN downloads/1612496491214.png (new file, 966 KiB)
BIN downloads/1612496545051.jpg (new file, 161 KiB)
BIN downloads/1612496585654.jpg (new file, 267 KiB)
BIN downloads/1612496617905.jpg (new file, 194 KiB)
BIN downloads/1612498171829.png (new file, 700 KiB)
BIN downloads/1612498234647.jpg (new file, 151 KiB)
BIN downloads/1612498296481.jpg (new file, 190 KiB)
BIN downloads/1612498357605.jpg (new file, 133 KiB)
BIN downloads/1612498614377.jpg (new file, 62 KiB)
BIN downloads/1612498872184.png (new file, 1.8 MiB)
BIN downloads/1612498944896.jpg (new file, 37 KiB)
BIN downloads/1612499007394.jpg (new file, 91 KiB)
BIN downloads/1612499068840.png (new file, 158 KiB)
BIN downloads/1612499129413.jpg (new file, 26 KiB)
BIN downloads/1612500378091.jpg (new file, 41 KiB)
BIN downloads/1612501855596.jpg (new file, 167 KiB)
BIN downloads/1612542694607.jpg (new file, 243 KiB)
BIN downloads/1612542757961.jpg (new file, 285 KiB)
BIN downloads/1612542819567.jpg (new file, 215 KiB)
BIN downloads/1612542880894.jpg (new file, 266 KiB)
BIN downloads/1612542942459.jpg (new file, 201 KiB)
BIN downloads/1612546870980.png (new file, 2.3 MiB)
BIN downloads/1612549708543.jpg (new file, 224 KiB)
BIN downloads/1612549769793.jpg (new file, 211 KiB)
BIN downloads/1612587310966.jpg (new file, 294 KiB)
BIN downloads/1612587373905.jpg (new file, 590 KiB)
BIN downloads/1612587435764.jpg (new file, 514 KiB)
BIN downloads/1612587498221.jpg (new file, 445 KiB)
BIN downloads/1612660763418.jpg (new file, 247 KiB)
BIN downloads/1612660825853.jpg (new file, 250 KiB)
BIN downloads/1612679388338.png (new file, 1.0 MiB)
BIN downloads/1612721086476.jpg (new file, 107 KiB)
BIN downloads/1612721838882.png (new file, 9.2 KiB)
BIN downloads/1612721863881.jpg (new file, 103 KiB)
BIN downloads/1612721920904.jpg (new file, 143 KiB)
BIN downloads/1612722054983.png (new file, 573 KiB)
BIN downloads/1612722082481.png (new file, 482 KiB)
BIN downloads/1612722415043.jpg (new file, 137 KiB)
BIN downloads/1612722444002.jpg (new file, 34 KiB)
BIN downloads/1612722505368.jpg (new file, 41 KiB)
BIN downloads/1612722858144.gif (new file, 2.0 MiB)
BIN downloads/1612723284202.jpg (new file, 84 KiB)
BIN downloads/1612723314446.jpg (new file, 444 KiB)
BIN downloads/1612723453700.jpg (new file, 1.6 MiB)
BIN downloads/1612723516899.jpg (new file, 188 KiB)
BIN downloads/1612723578590.jpg (new file, 107 KiB)
BIN downloads/1612723639738.jpg (new file, 123 KiB)
scrape.py (new executable file)
@@ -0,0 +1,93 @@
#!/usr/bin/env python3
# coding: utf8

# Scraper
from bs4 import BeautifulSoup
# Web connection
import mechanicalsoup
import wget

'''
############## image scraper ################
Give me a URL, I'll download all of its pictures!
'''

def getArgs():
    '''Get all the arguments passed to the script and return them in a parse_args()-type object.

    No args.

    Returns:
        - args: an args object containing all the arguments passed to the script.
    '''
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("-u", "--url", help="URL of the page", action="store", type=str, required=True)
    parser.add_argument("-f", "--folder", help="Folder in which downloads will go", action="store", type=str, required=True)
    # Create the args object
    args = parser.parse_args()

    return args

def html_get(url):
    '''
    Get the HTML from a webpage.

    Args:
        - url: a str containing the URL to scrape

    Returns:
        - page.soup: a BeautifulSoup object containing the page's HTML
    '''
    # Browser
    browser = mechanicalsoup.Browser()
    try:
        page = browser.get(url)
        print(f"Got html from {url}")
        return page.soup
    except Exception as e:
        # Report the error, then re-raise: returning None here would only
        # crash later in scraper() with a confusing AttributeError
        print(f"Got error {e}")
        raise

def scraper(soup):
    '''
    Scrape a bs4 HTML object, find img tags and get their sources.

    Args:
        - soup: a bs4 soup object

    Returns:
        - sources: a list of image sources
    '''
    tags = soup.find_all('img')
    sources = []
    for tag in tags:
        # img tags carry their source in the src attribute, not href;
        # skip any img that has no src
        src = tag.get('src')
        if src:
            sources.append(src)

    # Got image sources; strip any leading scheme. The replace() calls must be
    # chained: three separate `item.replace(...)` assignments would each start
    # over from the original string, so only the last one would take effect.
    for index, item in enumerate(sources):
        sources[index] = item.replace("http://", "").replace("https://", "").replace("//", "", 1)
    print(f"{len(sources)} images found")
    return sources

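# A minimal sketch of the extraction, assuming an ordinary page where images
# are plain <img> tags with absolute src attributes (hypothetical example tag):
#
#   <img src="https://www.example.com/pic.png">
#
# scraper() collects "https://www.example.com/pic.png", and the clean-up loop
# strips the scheme, leaving "www.example.com/pic.png". Note that relative
# src paths (e.g. "/static/pic.png") would not survive the "http://" + source
# re-assembly in item_dl().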

def item_dl(sources, dlfolder):
    '''
    Download every item in the sources list to the folder dlfolder.

    Args:
        - sources : a list of URLs
        - dlfolder : a str, the destination folder
    '''
    for source in sources:
        fullsource = "http://" + source
        # wget.download fetches fullsource into dlfolder and returns the path it wrote
        name = wget.download(fullsource, out=dlfolder)
        print(f"{name} downloaded")

if __name__ == "__main__":
    args = getArgs()
    folder = args.folder
    url = args.url
    #url = "https://boards.4chan.org/b/"
    #url = 'https://www.deviantart.com/'
    soup = html_get(url)
    sources = scraper(soup)
    item_dl(sources, folder)