This commit is contained in:
Justine 2021-02-07 21:00:49 +01:00
commit f4ca5d5835
56 changed files with 206 additions and 0 deletions

93
4chanscrape.py Executable file
View File

@ -0,0 +1,93 @@
#!/usr/bin/env python3
#coding: utf8
#Scraper
from bs4 import BeautifulSoup
#Connexion web
import mechanicalsoup
import wget
'''
############## 4chan thread scrapper ################
Give me a thread, i'll download all of its pictures !
'''
def getArgs():
    """Parse the script's command-line options.

    Returns:
        argparse.Namespace with two required attributes:
        - url: the thread URL to scrape.
        - folder: the directory downloads are written to.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("-u", "--url", help = "URL of the thread", action="store", type=str, required=True)
    parser.add_argument("-f", "--folder", help = "Folder in which downloads will go", action="store", type=str, required=True)
    # Parse sys.argv and hand the namespace straight back.
    return parser.parse_args()
def html_get(url):
    """Fetch a web page and parse it.

    Args:
        url: address of the page to fetch.

    Returns:
        A BeautifulSoup object for the page on success, or None when the
        request fails (the error is printed, best-effort style).
    """
    session = mechanicalsoup.Browser()
    try:
        page = session.get(url)
        print(f"Got html from {url}")
        return page.soup
    except Exception as err:
        # Deliberate best-effort: report and fall through to None.
        print(f"Got error {err}")
        return None
def scraper(soup):
    """Collect full-size image links from a 4chan thread page.

    Finds every thumbnail anchor (<a class="fileThumb">) and returns its
    href with any leading URL scheme marker stripped, so the downloader
    can prepend its own scheme.

    Args:
        soup: BeautifulSoup document of the thread page.

    Returns:
        list of scheme-less image URLs (e.g. "i.4cdn.org/b/123.jpg").
    """
    tags = soup.findAll('a', {"class": "fileThumb"})
    sources = []
    for tag in tags:
        href = tag['href']
        # BUG FIX: the old code re-read the *original* string before each
        # replace, so only the last substitution ("//") ever took effect.
        # Also, replace() removed every occurrence, which could mangle the
        # path — strip exactly one leading scheme marker instead.
        for prefix in ("https://", "http://", "//"):
            if href.startswith(prefix):
                href = href[len(prefix):]
                break
        sources.append(href)
    print(f"{len(sources)} images found")
    return sources
def item_dl(sources, dlfolder):
    """Download every image in *sources* into *dlfolder*.

    Args:
        sources: list of scheme-less URLs (host/path form).
        dlfolder: destination directory for the downloads.
    """
    for src in sources:
        # Sources had their scheme stripped by the scraper; restore one.
        target = "http://" + src
        fname = wget.download(target, out=dlfolder)
        print(f"{fname} downloaded")
def main():
    """Entry point: parse CLI arguments, scrape the thread, download images."""
    args = getArgs()
    soup = html_get(args.url)
    sources = scraper(soup)
    item_dl(sources, args.folder)


# Guard the script body so importing this module has no side effects.
if __name__ == "__main__":
    main()

20
README.md Normal file
View File

@ -0,0 +1,20 @@
# Scrapers
Two scrapers:
* The 4chan one downloads all images from a thread in the best available resolution
* The other one simply looks for "img" in any given page and downloads images
Install dependencies:
```
python3 -m pip install beautifulsoup4 mechanicalsoup wget --user
```
Use:
```
./4chanscrape.py -u https://boards.4channel.org/c/thread/3846676/gunsmith-cats-thread -f ./downloads
```
* -u : URL of the page
* -f : folder where you want to download all pictures

BIN
downloads/1612495733903.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 228 KiB

BIN
downloads/1612495889126.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 19 KiB

BIN
downloads/1612496273920.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 772 KiB

BIN
downloads/1612496326357.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 930 KiB

BIN
downloads/1612496427354.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 984 KiB

BIN
downloads/1612496458053.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 60 KiB

BIN
downloads/1612496491214.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 966 KiB

BIN
downloads/1612496545051.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 161 KiB

BIN
downloads/1612496585654.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 267 KiB

BIN
downloads/1612496617905.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 194 KiB

BIN
downloads/1612498171829.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 700 KiB

BIN
downloads/1612498234647.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 151 KiB

BIN
downloads/1612498296481.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 190 KiB

BIN
downloads/1612498357605.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 133 KiB

BIN
downloads/1612498614377.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 62 KiB

BIN
downloads/1612498872184.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.8 MiB

BIN
downloads/1612498944896.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 37 KiB

BIN
downloads/1612499007394.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 91 KiB

BIN
downloads/1612499068840.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 158 KiB

BIN
downloads/1612499129413.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 26 KiB

BIN
downloads/1612500378091.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 41 KiB

BIN
downloads/1612501855596.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 167 KiB

BIN
downloads/1612542694607.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 243 KiB

BIN
downloads/1612542757961.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 285 KiB

BIN
downloads/1612542819567.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 215 KiB

BIN
downloads/1612542880894.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 266 KiB

BIN
downloads/1612542942459.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 201 KiB

BIN
downloads/1612546870980.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.3 MiB

BIN
downloads/1612549708543.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 224 KiB

BIN
downloads/1612549769793.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 211 KiB

BIN
downloads/1612587310966.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 294 KiB

BIN
downloads/1612587373905.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 590 KiB

BIN
downloads/1612587435764.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 514 KiB

BIN
downloads/1612587498221.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 445 KiB

BIN
downloads/1612660763418.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 247 KiB

BIN
downloads/1612660825853.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 250 KiB

BIN
downloads/1612679388338.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.0 MiB

BIN
downloads/1612721086476.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 107 KiB

BIN
downloads/1612721838882.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 9.2 KiB

BIN
downloads/1612721863881.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 103 KiB

BIN
downloads/1612721920904.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 143 KiB

BIN
downloads/1612722054983.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 573 KiB

BIN
downloads/1612722082481.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 482 KiB

BIN
downloads/1612722415043.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 137 KiB

BIN
downloads/1612722444002.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 34 KiB

BIN
downloads/1612722505368.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 41 KiB

BIN
downloads/1612722858144.gif Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.0 MiB

BIN
downloads/1612723284202.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 84 KiB

BIN
downloads/1612723314446.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 444 KiB

BIN
downloads/1612723453700.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.6 MiB

BIN
downloads/1612723516899.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 188 KiB

BIN
downloads/1612723578590.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 107 KiB

BIN
downloads/1612723639738.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 123 KiB

93
scrape.py Executable file
View File

@ -0,0 +1,93 @@
#!/usr/bin/env python3
#coding: utf8
#Scraper
from bs4 import BeautifulSoup
#Connexion web
import mechanicalsoup
import wget
'''
############## image scrapper ################
Give me a url, i'll download all of its pictures !
'''
def getArgs():
    """Parse the script's command-line options.

    Returns:
        argparse.Namespace with two required attributes:
        - url: the page URL to scrape.
        - folder: the directory downloads are written to.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("-u", "--url", help = "URL of the thread", action="store", type=str, required=True)
    parser.add_argument("-f", "--folder", help = "Folder in which downloads will go", action="store", type=str, required=True)
    # Parse sys.argv and hand the namespace straight back.
    return parser.parse_args()
def html_get(url):
    """Fetch a web page and parse it.

    Args:
        url: address of the page to fetch.

    Returns:
        A BeautifulSoup object for the page on success, or None when the
        request fails (the error is printed, best-effort style).
    """
    session = mechanicalsoup.Browser()
    try:
        page = session.get(url)
        print(f"Got html from {url}")
        return page.soup
    except Exception as err:
        # Deliberate best-effort: report and fall through to None.
        print(f"Got error {err}")
        return None
def scraper(soup):
    """Collect image sources from a page.

    Finds every <img> element and returns its src with any leading URL
    scheme marker stripped, so the downloader can prepend its own scheme.

    Args:
        soup: BeautifulSoup document of the page.

    Returns:
        list of scheme-less image URLs (e.g. "example.com/pic.jpg").
    """
    tags = soup.findAll('img')
    sources = []
    for tag in tags:
        # BUG FIX: <img> elements carry their address in the "src"
        # attribute, not "href" — tag['href'] raised KeyError on every
        # normal image. Skip images that have no src at all.
        src = tag.get('src')
        if not src:
            continue
        # BUG FIX: the old code re-read the *original* string before each
        # replace, so only the last substitution ("//") ever took effect.
        # Strip exactly one leading scheme marker instead.
        for prefix in ("https://", "http://", "//"):
            if src.startswith(prefix):
                src = src[len(prefix):]
                break
        sources.append(src)
    print(f"{len(sources)} images found")
    return sources
def item_dl(sources, dlfolder):
    """Download every image in *sources* into *dlfolder*.

    Args:
        sources: list of scheme-less URLs (host/path form).
        dlfolder: destination directory for the downloads.
    """
    for src in sources:
        # Sources had their scheme stripped by the scraper; restore one.
        target = "http://" + src
        fname = wget.download(target, out=dlfolder)
        print(f"{fname} downloaded")
def main():
    """Entry point: parse CLI arguments, scrape the page, download images."""
    args = getArgs()
    soup = html_get(args.url)
    sources = scraper(soup)
    item_dl(sources, args.folder)


# Guard the script body so importing this module has no side effects.
if __name__ == "__main__":
    main()