This commit is contained in:
Justine 2021-02-07 21:00:49 +01:00
commit f4ca5d5835
56 changed files with 206 additions and 0 deletions

93
4chanscrape.py Executable file
View File

@ -0,0 +1,93 @@
#!/usr/bin/env python3
#coding: utf8
#Scraper
from bs4 import BeautifulSoup
#Connexion web
import mechanicalsoup
import wget
'''
############## 4chan thread scrapper ################
Give me a thread, i'll download all of its pictures !
'''
def getArgs():
    """Parse the script's command-line options.

    Returns:
        argparse.Namespace with two required attributes:
        - url: the thread URL to scrape.
        - folder: the directory downloads are written to.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("-u", "--url", help = "URL of the thread", action="store", type=str, required=True)
    parser.add_argument("-f", "--folder", help = "Folder in which downloads will go", action="store", type=str, required=True)
    # Parse sys.argv and hand the namespace straight back.
    return parser.parse_args()
def html_get(url):
    """Fetch a web page and parse it.

    Args:
        url: address of the page to fetch.

    Returns:
        A BeautifulSoup object for the page on success, or None when the
        request fails (the error is printed, best-effort style).
    """
    session = mechanicalsoup.Browser()
    try:
        page = session.get(url)
        print(f"Got html from {url}")
        return page.soup
    except Exception as err:
        # Deliberate best-effort: report and fall through to None.
        print(f"Got error {err}")
        return None
def scraper(soup):
    """Collect full-size image links from a 4chan thread page.

    Finds every thumbnail anchor (<a class="fileThumb">) and returns its
    href with any leading URL scheme marker stripped, so the downloader
    can prepend its own scheme.

    Args:
        soup: BeautifulSoup document of the thread page.

    Returns:
        list of scheme-less image URLs (e.g. "i.4cdn.org/b/123.jpg").
    """
    tags = soup.findAll('a', {"class": "fileThumb"})
    sources = []
    for tag in tags:
        href = tag['href']
        # BUG FIX: the old code re-read the *original* string before each
        # replace, so only the last substitution ("//") ever took effect.
        # Also, replace() removed every occurrence, which could mangle the
        # path — strip exactly one leading scheme marker instead.
        for prefix in ("https://", "http://", "//"):
            if href.startswith(prefix):
                href = href[len(prefix):]
                break
        sources.append(href)
    print(f"{len(sources)} images found")
    return sources
def item_dl(sources, dlfolder):
    """Download every image in *sources* into *dlfolder*.

    Args:
        sources: list of scheme-less URLs (host/path form).
        dlfolder: destination directory for the downloads.
    """
    for src in sources:
        # Sources had their scheme stripped by the scraper; restore one.
        target = "http://" + src
        fname = wget.download(target, out=dlfolder)
        print(f"{fname} downloaded")
def main():
    """Entry point: parse CLI arguments, scrape the thread, download images."""
    args = getArgs()
    soup = html_get(args.url)
    sources = scraper(soup)
    item_dl(sources, args.folder)


# Guard the script body so importing this module has no side effects.
if __name__ == "__main__":
    main()

20
README.md Normal file
View File

@ -0,0 +1,20 @@
# Scrapers
Two scrapers:
* The 4chan one downloads all images from a thread in the best available resolution
* The other one simply looks for "img" in any given page and downloads images
Install dependencies:
```
python3 -m pip install beautifulsoup4 mechanicalsoup wget --user
```
Use:
```
./4chanscrape.py -u https://boards.4channel.org/c/thread/3846676/gunsmith-cats-thread -f ./downloads
```
* -u : URL of the page
* -f : folder where you want to download all pictures

BIN
downloads/1612495733903.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 228 KiB

BIN
downloads/1612495889126.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 19 KiB

BIN
downloads/1612496273920.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 772 KiB

BIN
downloads/1612496326357.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 930 KiB

BIN
downloads/1612496427354.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 984 KiB

BIN
downloads/1612496458053.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 60 KiB

BIN
downloads/1612496491214.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 966 KiB

BIN
downloads/1612496545051.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 161 KiB

BIN
downloads/1612496585654.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 267 KiB

BIN
downloads/1612496617905.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 194 KiB

BIN
downloads/1612498171829.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 700 KiB

BIN
downloads/1612498234647.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 151 KiB

BIN
downloads/1612498296481.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 190 KiB

BIN
downloads/1612498357605.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 133 KiB

BIN
downloads/1612498614377.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 62 KiB

BIN
downloads/1612498872184.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.8 MiB

BIN
downloads/1612498944896.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 37 KiB

BIN
downloads/1612499007394.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 91 KiB

BIN
downloads/1612499068840.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 158 KiB

BIN
downloads/1612499129413.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 26 KiB

BIN
downloads/1612500378091.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 41 KiB

BIN
downloads/1612501855596.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 167 KiB

BIN
downloads/1612542694607.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 243 KiB

BIN
downloads/1612542757961.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 285 KiB

BIN
downloads/1612542819567.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 215 KiB

BIN
downloads/1612542880894.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 266 KiB

BIN
downloads/1612542942459.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 201 KiB

BIN
downloads/1612546870980.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.3 MiB

BIN
downloads/1612549708543.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 224 KiB

BIN
downloads/1612549769793.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 211 KiB

BIN
downloads/1612587310966.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 294 KiB

BIN
downloads/1612587373905.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 590 KiB

BIN
downloads/1612587435764.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 514 KiB

BIN
downloads/1612587498221.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 445 KiB

BIN
downloads/1612660763418.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 247 KiB

BIN
downloads/1612660825853.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 250 KiB

BIN
downloads/1612679388338.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.0 MiB

BIN
downloads/1612721086476.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 107 KiB

BIN
downloads/1612721838882.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 9.2 KiB

BIN
downloads/1612721863881.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 103 KiB

BIN
downloads/1612721920904.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 143 KiB

BIN
downloads/1612722054983.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 573 KiB

BIN
downloads/1612722082481.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 482 KiB

BIN
downloads/1612722415043.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 137 KiB

BIN
downloads/1612722444002.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 34 KiB

BIN
downloads/1612722505368.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 41 KiB

BIN
downloads/1612722858144.gif Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.0 MiB

BIN
downloads/1612723284202.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 84 KiB

BIN
downloads/1612723314446.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 444 KiB

BIN
downloads/1612723453700.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.6 MiB

BIN
downloads/1612723516899.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 188 KiB

BIN
downloads/1612723578590.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 107 KiB

BIN
downloads/1612723639738.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 123 KiB

93
scrape.py Executable file
View File

@ -0,0 +1,93 @@
#!/usr/bin/env python3
#coding: utf8
#Scraper
from bs4 import BeautifulSoup
#Connexion web
import mechanicalsoup
import wget
'''
############## image scrapper ################
Give me a url, i'll download all of its pictures !
'''
def getArgs():
    """Parse the script's command-line options.

    Returns:
        argparse.Namespace with two required attributes:
        - url: the page URL to scrape.
        - folder: the directory downloads are written to.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("-u", "--url", help = "URL of the thread", action="store", type=str, required=True)
    parser.add_argument("-f", "--folder", help = "Folder in which downloads will go", action="store", type=str, required=True)
    # Parse sys.argv and hand the namespace straight back.
    return parser.parse_args()
def html_get(url):
    """Fetch a web page and parse it.

    Args:
        url: address of the page to fetch.

    Returns:
        A BeautifulSoup object for the page on success, or None when the
        request fails (the error is printed, best-effort style).
    """
    session = mechanicalsoup.Browser()
    try:
        page = session.get(url)
        print(f"Got html from {url}")
        return page.soup
    except Exception as err:
        # Deliberate best-effort: report and fall through to None.
        print(f"Got error {err}")
        return None
def scraper(soup):
    """Collect image sources from a page.

    Finds every <img> element and returns its src with any leading URL
    scheme marker stripped, so the downloader can prepend its own scheme.

    Args:
        soup: BeautifulSoup document of the page.

    Returns:
        list of scheme-less image URLs (e.g. "example.com/pic.jpg").
    """
    tags = soup.findAll('img')
    sources = []
    for tag in tags:
        # BUG FIX: <img> elements carry their address in the "src"
        # attribute, not "href" — tag['href'] raised KeyError on every
        # normal image. Skip images that have no src at all.
        src = tag.get('src')
        if not src:
            continue
        # BUG FIX: the old code re-read the *original* string before each
        # replace, so only the last substitution ("//") ever took effect.
        # Strip exactly one leading scheme marker instead.
        for prefix in ("https://", "http://", "//"):
            if src.startswith(prefix):
                src = src[len(prefix):]
                break
        sources.append(src)
    print(f"{len(sources)} images found")
    return sources
def item_dl(sources, dlfolder):
    """Download every image in *sources* into *dlfolder*.

    Args:
        sources: list of scheme-less URLs (host/path form).
        dlfolder: destination directory for the downloads.
    """
    for src in sources:
        # Sources had their scheme stripped by the scraper; restore one.
        target = "http://" + src
        fname = wget.download(target, out=dlfolder)
        print(f"{fname} downloaded")
def main():
    """Entry point: parse CLI arguments, scrape the page, download images."""
    args = getArgs()
    soup = html_get(args.url)
    sources = scraper(soup)
    item_dl(sources, args.folder)


# Guard the script body so importing this module has no side effects.
if __name__ == "__main__":
    main()