threadfinder

2021-02-07 22:46:38 +01:00
parent 6d08071c6c
commit bd16dc7a6e
3 changed files with 166 additions and 2 deletions
--- a/4chanthreadfinder.py
+++ b/4chanthreadfinder.py
@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+#coding: utf8
+
+#Scraper
+from bs4 import BeautifulSoup
+#Connexion web
+import mechanicalsoup
+import wget
+from os import mkdir, listdir
+from re import findall
+'''
+############## 4chan thread scraper ################
+Here we look for particular threads on 4chan and download their images
+'''
+
+
+def getArgs():
+    '''Parse the command-line options given to the script.
+    Takes no arguments.
+    Returns:
+    - an argparse namespace exposing the three mandatory options as the attributes
+      url, folder and keyword (all str).
+    '''
+    import argparse
+
+    #(short flag, long flag, help text) of every mandatory string option
+    options = (
+        ("-u", "--url", "URL of the board. MUST INCLUDE FINAL /, ex : https://boards.4chan.org/b/"),
+        ("-f", "--folder", "Folder in which downloads will go, ex: ./downloads"),
+        ("-k", "--keyword", "keyword or phrase to look for in the threads, ex : 'cute thread'"),
+    )
+
+    cli = argparse.ArgumentParser()
+    for short_flag, long_flag, description in options:
+        cli.add_argument(short_flag, long_flag, help=description, action="store", type=str, required=True)
+
+    #parse_args() reads sys.argv and exits by itself when an option is missing
+    return cli.parse_args()
+
+def thread_finder(homepage, keyword):
+    '''
+    Lists the threads of a board page in which at least one visible post contains keyword.
+    The search is a case-insensitive substring match on the text of every post message
+    of the thread (not only the first one).
+    Args:
+    - homepage: bs4 soup object containing html from the homepage of the board,
+      or None (what html_get returns after a failure)
+    - keyword : word or phrase to look for
+    Returns:
+    - hrefs : list of str, the references to matching threads, in page order and
+      without duplicates. They must be appended to the board URL to work.
+      Empty list when homepage is None or nothing matches.
+    '''
+    if homepage is None:
+        return []
+
+    needle = keyword.lower()
+    hrefs = []
+    seen = set()
+
+    for thread in homepage.findAll('div', {'class': 'thread'}):
+        posts = thread.findAll('blockquote', {'class' : 'postMessage'})
+        #One matching post is enough: looking further only produced duplicated links
+        if not any(needle in post.text.lower() for post in posts):
+            continue
+        for link in thread.findAll('a', {'title': 'Reply to this post'}):
+            #The "#..." fragment is never sent to the server, so links differing
+            #only by it would fetch the very same page several times
+            target = link['href'].split('#', 1)[0]
+            if target not in seen:
+                seen.add(target)
+                hrefs.append(target)
+    return hrefs
+    
+
+
+
+def html_get(url):
+    '''
+    Fetch a web page and hand back its parsed html.
+    Args:
+    - url : str, address of the page to scrape
+    Returns:
+    - the soup attribute of the response obtained through mechanicalsoup,
+      or None when anything raised while fetching (the error is printed, not raised)
+    '''
+    client = mechanicalsoup.Browser()
+    try:
+        response = client.get(url)
+        print(f"Got html from {url}")
+        parsed = response.soup
+    except Exception as error:
+        print(f"Got error {error}")
+        #Nothing usable was fetched: say so explicitly to the caller
+        parsed = None
+    return parsed
+
+
+def scraper(soup):
+    '''
+    Scrape a bs4 html object, find posts w/ images and get full size source
+    Args:
+    - soup : bs4 soup item, or None (what html_get returns after a failure)
+    Returns:
+    - sources : A list of image sources with their leading "http://", "https://"
+      or "//" removed, ex: i.4cdn.org/b/123.jpg. Empty list when soup is None.
+    '''
+    if soup is None:
+        return []
+
+    sources = []
+    for tag in soup.findAll('a', {"class": "fileThumb"}):
+        source = tag['href']
+        #Only a LEADING scheme is removed, and only once: the previous chained
+        #replace() calls each restarted from the raw link, so only the "//" one
+        #applied, turning "https://x" into "https:x" and eating any "//" in the path
+        for prefix in ("https://", "http://", "//"):
+            if source.startswith(prefix):
+                source = source[len(prefix):]
+                break
+        sources.append(source)
+    return sources
+
+
+def item_dl(sources, dlfolder):
+    '''
+    Download all items in the sources list to folder dlfolder, which we try to create
+    (missing parent folders included).
+    Items whose file name is already present in dlfolder, or was already met earlier
+    in sources, are skipped. The sources list itself is left untouched.
+    Args:
+    - sources : a list of URLs without scheme, ex: i.4cdn.org/b/123.jpg ("http://" is prepended)
+    - dlfolder : path of the folder receiving the downloads
+    '''
+    from os import makedirs
+
+    #Making folder, parents included so that a path like ./downloads/thread works
+    try:
+        makedirs(dlfolder)
+    except FileExistsError:
+        print(f"{dlfolder} already exists, not creating")
+
+    #Deduplicating into a new list: popping from sources while looping over it
+    #skipped the item that followed each duplicate
+    known_names = set(listdir(dlfolder))
+    todo = []
+    for source in sources:
+        #The file name is whatever follows the last /
+        imagename = source.rsplit("/", 1)[-1]
+        if imagename in known_names:
+            print(f"Found duplicate {imagename}")
+            continue
+        known_names.add(imagename)
+        todo.append(source)
+
+    for source in todo:
+        fullsource = "http://" + source
+        name = wget.download(fullsource, out=dlfolder)
+        print(f"{name} downloaded")
+
+
+def main():
+    '''
+    Script entry point: reads the command-line options, looks on the board page for
+    the threads matching the keyword, then downloads the images of each of them.
+    Exits with an error message when the board page could not be fetched.
+    '''
+    from urllib.parse import urljoin
+
+    #Getting main elements
+    args = getArgs()
+    folder = args.folder
+    keyword = args.keyword
+    #Thread links are relative to the board: the base must end with / or urljoin
+    #would drop the board segment
+    url = args.url if args.url.endswith("/") else args.url + "/"
+
+    soup = html_get(url)
+    if soup is None:
+        #Without the board page there is nothing to search
+        raise SystemExit(f"Could not read {url}, nothing to download")
+    hrefs = thread_finder(soup, keyword)
+
+    #Dling all threads found
+    for href in hrefs:
+        thread_url = urljoin(url, href)
+        print(f"going after {thread_url}")
+        subsoup = html_get(thread_url)
+        if subsoup is None:
+            #One unreachable thread must not stop the other downloads
+            print(f"Skipping {thread_url}")
+            continue
+        subsources = scraper(subsoup)
+        print(subsources)
+        item_dl(subsources, folder)
+
+
+#Only run when executed as a script, not when the file is imported
+if __name__ == "__main__":
+    main()
--- a/README.md
+++ b/README.md
@ -1,9 +1,11 @@
 # Scrappers

 Two scrappers:
-* The 4chan one dls all images from a thread in best res
+* The 4chanscrape one downloads all images from a thread in the best available resolution
 * The other one simply looks for "img" in any given page and downloads images
+* 4chanthreadfinder looks for a keyword in the posts of the threads listed on a board page, and downloads all images from the matching threads

+## 4chanscrape, imgscrape
 Install depedencies:

 ```
@ -18,3 +20,21 @@ Use:

 * -u : URL of the page
 * -f : folder where you want to download all pictures
+
+## 4chanthreadfinder
+Install dependencies:
+
+```
+python3 -m pip install beautifulsoup4 mechanicalsoup wget --user
+```
+
+Use:
+
+```
+./4chanthreadfinder.py -u https://boards.4chan.org/b/ -f ./downloads/thread -k 'ylyl thread'
+```
+
+* -u : URL of the page
+* -f : folder where you want to download all pictures
+* -k : keyword or key phrase to search for (a single word works best!)
+
--- a/scrape.py
+++ b/scrape.py
@ -60,7 +60,7 @@ def scraper(soup):
    tags=soup.findAll('img')
    sources = []
    for tag in tags:
-        sources.append(tag['href'])
+        sources.append(tag['src'])

    #Got image sources, removing any left http headers
    for index, item in enumerate(sources):