@@ -0,0 +1,21 @@
This is free and unencumbered software released into the public domain.

Anyone is free to copy, modify, publish, use, compile, sell, or distribute
this software, either in source code form or as a compiled binary, for any
purpose, commercial or non-commercial, and by any means.

In jurisdictions that recognize copyright laws, the author or authors of this
software dedicate any and all copyright interest in the software to the public
domain. We make this dedication for the benefit of the public at large and to
the detriment of our heirs and successors. We intend this dedication to be an
overt act of relinquishment in perpetuity of all present and future rights to
this software under copyright law.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH
THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

For more information, please refer to <http://unlicense.org/>
@@ -0,0 +1,90 @@
# Kiwi Scraper

A Python-based web scraper that scrapes kiwifarms.net threads and filters their posts according to post ratings.

This program allows users to filter by:
- positive ratings
- negative ratings
- neutral ratings
- total ratings
- a specific rating
- weighted score (positive ratings have positive values, negative ratings have negative values; see the sketch below)

In its current state, this program outputs its findings in the form of a JSON file.
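
For reference, here is a minimal sketch of how the weighted score is computed (the rating buckets mirror the ones defined in `postData.py`):

```python
# Positive ratings add to the score, negative ratings subtract,
# and neutral ratings are ignored.
POSITIVE = {"Like", "Agree", "Winner", "Informative", "Thunk-Provoking",
            "Feels", "Semper Fidelis", "Achievement"}
NEGATIVE = {"Dislike", "Disagree", "Autistic", "TMI", "Late", "Dumb",
            "Mad at the Internet"}

def weighted_score(ratings):
    """ratings maps a rating name to its count, e.g. {"Like": 16, "Dumb": 3}."""
    score = 0
    for name, count in ratings.items():
        if name in POSITIVE:
            score += count
        elif name in NEGATIVE:
            score -= count
    return score

print(weighted_score({"Like": 16, "Winner": 44, "Dumb": 3}))  # 57
```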
Example:

![Example JSON output](jsonExample.JPG)

## Required Packages

This program requires the following packages (`lxml` is included because it is the parser passed to BeautifulSoup):

```bash
pip install bs4
pip install requests
pip install lxml
```
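
To confirm the dependencies are importable before running, a quick optional sanity check:

```bash
python -c "import bs4, requests, lxml; print('dependencies OK')"
```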
## Usage

Example run:

```
Welcome to the Kiwi Scraper!

Please provide the link to the thread you want analyzed below.
Please note that this program will start searching at the first thread page that you link to, so if you'd like the thread analyzed starting at the first page, please link to the first page of the thread; otherwise, provide a link to the first page you want scraped.
If the link that you provide is valid but not working, try using the .nl/.pl domains, as Cloudflare might be blocking your request.
: https://kiwifarms.net/threads/russell-greer-theofficialinstaofrussellgreer.30488/
Fetching thread...
------------------------------------------
Thread: Russell Greer / @theofficialinstaofrussellgreer - Swift-Obsessed Sex Pest, Magical Star Buddy
------------------------------------------
Would you like to stop at a certain page (y/n)?: y
What page do you want to stop at?: 1
------------------------------------------
How would you like the thread to be filtered?
(1) positive ratings
(2) negative ratings
(3) neutral ratings
(4) total ratings
(5) specific rating
(6) weighted score (positive ratings count as positive points, negative ratings count as negative points, and neutral ratings don't count)
: 4
------------------------------------------
Enter a minimum number of total ratings.
: 20
Grab some popcorn, this might take a while...

POST FOUND - User: Cryin RN | Date: May 1, 2017 | #1
Total Reactions: 178 Weighted: 167 Positive: 170 Negative: 3 Neutral: 3
Like: 16 Agree: 2 Winner: 44 Informative: 101 Feels: 1 Islamic Content: 2 Autistic: 3 Horrifying: 1 Semper Fidelis: 6 DRINK!: 2

POST FOUND - User: Kugelsak Kastengrus 6th | Date: May 1, 2017 | #2
Total Reactions: 38 Weighted: 38 Positive: 38 Negative: 0 Neutral: 0
Like: 2 Agree: 29 Informative: 4 Feels: 3

/you get the idea/

Scraping finished.
Posts found: 15
------------------------------------------
Please enter directory to save output to.
: /path/to/directory
What would you like to name your JSON file?
: filename

Creating file...
Successful.
File saved to /path/to/directory/filename.json
```
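
The output JSON maps each post number to its link, metadata, content, and ratings (see the template at the top of `jsonConvert.py`). A minimal sketch of reading the file back, assuming the path used above:

```python
import json

# Load the scraper's output and summarize each saved post.
with open("/path/to/directory/filename.json") as f:
    posts = json.load(f)

for post_num, post in posts.items():
    user = post["metadata"]["user"]
    total = post["ratings"]["total ratings"]
    print(f"{post_num}: {user} ({total} total ratings)")
```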
## Contributing

Pull requests are welcome. I'm always open to new ideas.
@@ -1,80 +0,0 @@
from jsonConvert import JsonConvert
from postData import PostData
import os


def yesno_():
    #collects and verifies yes/no input; returns True if yes, False if no
    choice = input("(y/n): ")
    while choice.lower() not in ("y", "n"):
        print("Invalid input. Try again.")
        choice = input("(y/n): ")
    return choice.lower() == "y"


def createJson(posts):
    #creates a json file using the JsonConvert class
    #returns True if the file was successfully created, False if not
    #
    #posts = list of PostData objects
    jsonCreate = JsonConvert()
    print("Please enter directory to save output to.")
    path = input(": ")
    while not os.path.isdir(path):  #validate path
        print("Invalid pathname. Try again.")
        path = input(": ")
    print("What would you like to name your JSON file?")
    fileName = input(": ")
    while os.path.exists(os.path.join(path, fileName + ".json")):  #if a file at the path already exists, have the user confirm the decision
        print("A file with the same name already exists in the path that you specified. Would you like to replace it?")
        if yesno_():
            break
        print("Would you like to select a new directory?")
        if yesno_():
            return createJson(posts)
        fileName = input("Enter a new file name: ")
    #add posts to jsonCreate
    for post in posts:
        jsonCreate.unpackPostData(post)
    #create the json file
    print("\nCreating file...")
    if not jsonCreate.exportJson(path, fileName):
        #if the program failed to create the file, ask if the user wants to
        #try a different directory, or give up
        print("""
Error: Failed to create file.
What would you like to do?
(1) try different directory
(2) try something else
""")
        ui = input(": ")
        while not ui.isdigit() or not (0 < int(ui) < 3):
            print("Error: Input must be a number between 1 and 2. Try again.")
            ui = input(": ")
        if int(ui) == 1:  #try again with a different path
            return createJson(posts)
        return False  #give up
    #json file successfully created
    print("Successful.")
    print(f"File saved to {os.path.join(path, fileName + '.json')}")
    return True
@@ -1,117 +0,0 @@
#functions used for creating json files for output to the user
#the user will be able to choose what data goes into the json file
"""
json template:

post #
|- link
|
|- "metadata"
|  |- user
|  |- date
|  |- edit date
|
|- "content"
|  |- raw html
|  |- raw text
|  |- formatted text with media/stuff
|  |- attachments
|
|- "ratings"
|  |- "specific"
|  |  |- Like
|  |  |- Dislike
|  |  |- Agree
|  |  |- Disagree
|  |  |- Winner
|  |  |- Informative
|  |  |- Thunk-Provoking
|  |  |- Feels
|  |  |- Islamic Content
|  |  |- Lunacy
|  |  |- Autistic
|  |  |- Horrifying
|  |  |- Optimistic
|  |  |- TMI
|  |  |- Late
|  |  |- Dumb
|  |  |- Mad at the Internet
|  |  |- Semper Fidelis
|  |  |- Deviant
|  |  |- Achievement
|  |- positive
|  |- neutral
|  |- negative
|  |- weighted score
|  |- total ratings
"""
from postData import PostData
import json
import os


class JsonConvert:
    def __init__(self):
        self.postData = {}  #dictionary to be converted into json output

    def unpackPostData(self, pd):
        #unpacks a PostData object (pd) into the postData dictionary,
        #following the template above
        postNum = "#" + str(pd.postNum)  #post number
        self.postData[postNum] = {
            "link": pd.postLink,
            "metadata": {
                "user": pd.poster,
                "date": pd.postDate,
                "edit date": pd.editDate,
            },
            "content": {
                "raw html": [pd.rawDat],
                "raw text": [pd.rawText],
                "formatted text": pd.content,
                "attachments": pd.attachedMedia,
            },
            "ratings": {
                #copy every rating count, including any rating not in the
                #standard list (e.g. event-specific ones)
                "specific ratings": dict(pd.ratings),
                "positive": pd.positive,
                "negative": pd.negative,
                "neutral": pd.neutral,
                "weighted score": pd.weightedScore,
                "total ratings": pd.totalScore,
            },
        }

    def exportJson(self, path, fileName):
        #creates a json file from the postData dictionary and saves it at path
        #returns True on success, False otherwise
        #
        #path = directory to save the json file to
        #fileName = name of the json file
        try:
            with open(os.path.join(path, fileName + ".json"), "w") as export:
                json.dump(self.postData, export)
            return True
        except OSError:
            return False
@@ -1,190 +0,0 @@
from bs4 import BeautifulSoup
import requests
import re


def getText(post):
    #returns the text from a post
    return post.find("div", class_="bbWrapper").text


def getContent(post):
    #TODO: add functionality for printing out a post's text along with:
    # -links
    # -spoilers
    # -pictures
    # -reply quotes
    # -quoted text
    # -embedded video (youtube)
    # -video
    return ""


def getAttachments(post):
    #returns links to any attachments from a post
    try:
        attachContainer = post.find("ul", class_="attachmentList")
        attachments = []
        for at in attachContainer.find_all("li", class_="attachment"):
            attachments.append("http://www.kiwifarms.net" + at.find("a")["href"])
        return attachments
    except (AttributeError, TypeError):  #if no attachments were found, return an empty list
        return []


def getRatings(post, TLD):
    #returns a dictionary with all of a post's ratings
    try:
        reactionsLink = f"http://kiwifarms{TLD}" + post.find("a", class_="reactionsBar-link")["href"]
        while True:
            try:
                reactions = requests.get(reactionsLink).text
                break
            except requests.RequestException:
                #if the reactions page can't be reached, ask the user whether to retry or skip
                postNum = "".join(re.split("\n|\t", post.find("ul", class_="message-attribution-opposite--list").find_all("li")[1].find("a").text))[1:]
                print(f"Error: Unable to retrieve reactions for post #{postNum}.")
                print("What would you like to do?")
                print("(1) retry (please wait a few minutes before you do this)")
                print("(2) skip")
                choice = input(": ")
                while not choice.isdigit() or not (0 < int(choice) < 3):
                    print("Error: Input must be a number between 1 and 2.")
                    choice = input(": ")
                if int(choice) == 1:
                    continue
                return {}  #skip: treat this post as having no reactions
        reactions = BeautifulSoup(reactions, "lxml")
        reactionsFound = {}
        for r in reactions.find("span", class_="hScroller-scroll").find_all("a"):
            try:
                foundReaction = r.find("span", class_="reaction-text").text.split(" (")
                reactionsFound[foundReaction[0]] = int(foundReaction[1][:-1])
            except (AttributeError, ValueError):
                continue
        return reactionsFound
    except (AttributeError, TypeError):  #if no reactions were found, return an empty dictionary
        return {}


class PostData:
    #stores data about a single post

    def __init__(self):
        #metadata
        self.postLink = ""     #link to the post
        self.postNum = 0       #post's number in the thread
        self.poster = ""       #user making the post
        self.postDate = ""     #time of the post
        self.edited = False    #True if the post was edited
        self.editDate = "n/a"  #date of the last edit
        self.ratings = {  #ratings received
            "Like": 0,
            "Dislike": 0,
            "Agree": 0,
            "Disagree": 0,
            "Winner": 0,
            "Informative": 0,
            "Thunk-Provoking": 0,
            "Feels": 0,
            "Islamic Content": 0,
            "Lunacy": 0,
            "Autistic": 0,
            "Horrifying": 0,
            "Optimistic": 0,
            "TMI": 0,
            "Late": 0,
            "Dumb": 0,
            "Mad at the Internet": 0,
            "Semper Fidelis": 0,
            "Deviant": 0,
            "Achievement": 0
        }
        #content
        self.rawDat = ""        #raw HTML data from the post
        self.rawText = ""       #text from the post, not including pictures/media
        self.content = ""       #all text/media from the post
        self.attachedMedia = [] #links to any attached media
        #rating data
        self.positive = 0  #positive ratings
        self.neutral = 0   #neutral ratings
        self.negative = 0  #negative ratings
        self.weightedScore = 0  #score where positive ratings count as positive points and negative ratings as negative points
        self.totalScore = 0     #total number of ratings received
        self.reactionsEvaluated = False  #True once this post's reactions have been evaluated

    def addRatings(self, newRatings):
        #takes in newRatings from the passed-in dictionary and adjusts
        #both the ratings and the rating data accordingly
        pos = ["Like", "Agree", "Winner", "Informative", "Thunk-Provoking", "Feels", "Semper Fidelis", "Achievement"]
        neu = ["Islamic Content", "Lunacy", "Horrifying", "Optimistic", "Deviant"]
        neg = ["Dislike", "Disagree", "Autistic", "TMI", "Late", "Dumb", "Mad at the Internet"]
        #exit if the ratings are not in dictionary form
        if not isinstance(newRatings, dict):
            return
        #record each rating; ratings outside the pos/neu/neg lists (e.g.
        #event-specific ones like "DRINK!") still count toward the total
        for key, value in newRatings.items():
            self.ratings[key] = value
            if key in pos:
                self.positive += value
                self.weightedScore += value
            elif key in neu:
                self.neutral += value
            elif key in neg:
                self.negative += value
                self.weightedScore -= value
            self.totalScore += value

    def reactionData(self, infoBS, TLD):
        #processes the passed-in post info from BeautifulSoup
        #and returns a dictionary with its reaction data
        #
        #useful for checking reactions before saving the rest of a post's data
        self.addRatings(getRatings(infoBS, TLD))
        self.reactionsEvaluated = True
        return {"positive": self.positive, "neutral": self.neutral, "negative": self.negative, "weighted score": self.weightedScore, "total score": self.totalScore}

    def takeInfo(self, infoBS, TLD):
        #takes a BeautifulSoup object representing the HTML for a post
        #and sorts its data into the proper variables
        #get metadata
        self.postLink = f"http://kiwifarms{TLD}" + infoBS.find("a", class_="u-concealed")["href"]
        self.postNum = int("".join(re.split("\n|\t", infoBS.find("ul", class_="message-attribution-opposite--list").find_all("li")[-1].find("a").text))[1:])
        try:
            self.poster = infoBS.find("a", class_="username").text
        except AttributeError:  #if the user is a guest, the username is stored in a different tag
            self.poster = infoBS.find("span", class_="username").text
        self.postDate = infoBS.find("time", class_="u-dt").text
        try:
            self.editDate = infoBS.find("div", class_="message-lastEdit").find("time", class_="u-dt").text
            self.edited = True
        except AttributeError:
            self.edited = False
        #get content
        self.rawDat = str(infoBS)
        self.rawText = getText(infoBS)
        self.content = getContent(infoBS)
        self.attachedMedia = getAttachments(infoBS)
        #get rating data (skip if reactionData() was already called on this post)
        if not self.reactionsEvaluated:
            self.reactionData(infoBS, TLD)
@@ -1 +0,0 @@
{"#1": {"link": "http://kiwifarms.net/threads/roy-dr-p-philipose.7986/post-566857", "metadata": {"user": "Atsimuel", "date": "Feb 24, 2015", "edit date": "n/a"}, "content": {"raw html":
@@ -1,236 +0,0 @@
#Scrapes kiwifarms.net threads for posts above a certain ratings threshold.
#This filter can be set by the user, with several available options.
#REQUIRED PACKAGES: bs4, requests, lxml
from bs4 import BeautifulSoup
import requests
from postData import PostData
from createOutput import createJson
import enum


class Filter(enum.Enum):  #enumeration for the type of filter
    NA = 0        #not yet assigned
    positive = 1  #positive ratings
    negative = 2  #negative ratings
    neutral = 3   #neutral ratings
    total = 4     #total ratings
    specific = 5  #specific rating
    weighted = 6  #weighted rating
def collectInput(upper):
    #collects user input for multiple-choice questions, validates it, and returns the response
    #
    #upper = upper bound for valid input
    choice = input(": ")
    while not choice.isdigit() or not (0 < int(choice) <= upper):
        print(f"Error: Input must be a number between 1 and {upper}. Try again.")
        choice = input(": ")
    return int(choice)


def yesno(question):
    #collects and verifies yes/no input; returns True if yes, False if no
    #
    #question = question to ask
    choice = input(question)
    while choice.lower() not in ("y", "n"):
        print("Invalid input. Try again.")
        choice = input(question)
    return choice.lower() == "y"


def getTLD(url):
    #extracts the TLD from a url
    tld = url.split("/")[2][-3:]
    if tld == ".nl" or tld == ".pl":
        return tld
    return ".net"


def printPost(post):
    #prints basic info about a post
    print(f"""
POST FOUND - User: {post.poster} | Date: {post.postDate} | #{post.postNum}
Total Reactions: {post.totalScore} Weighted: {post.weightedScore} Positive: {post.positive} Negative: {post.negative} Neutral: {post.neutral}""")
    reactionsCtr = ""
    for reaction, num in post.ratings.items():
        if num > 0:
            reactionsCtr += f"{reaction}: {num} "
    print(reactionsCtr)
###### Variables ######
#thread data
threadLink = ""   #link to the thread
threadTitle = ""  #title of the thread
pageData = ""     #BeautifulSoup object storing the thread page's HTML
TLD = ""          #top-level domain of the URL; distinguishes whether the .net/.pl/.nl domain is being used
posts = []        #stores the HTML for each post on a page
indivPost = PostData()  #PostData object for an individual post
savedPosts = []   #stores PostData objects for posts to be saved
pageNum = 0       #current page number
#filter toggles
filterSelection = Filter.NA  #type of filter being used
RS_type = ""   #specific reaction to sort by when sorting by a specific reaction
minRating = 0  #minimum number of ratings/score for the filter
stopPage = -1  #page number to stop searching at; -1 means no page specified
#miscellaneous
choice = 0         #used for storing user multiple-choice input
ui = ""            #stores user input of other types
ctr = 0            #counter variable
terminate = False  #used for terminating scraping in case of error
######################
#collect the link to the thread
print("Welcome to the Kiwi Scraper!\n")
print("Please provide the link to the thread you want analyzed below.")
print("\nPlease note that this program will start searching at the first thread page that you link to,",
      "so if you'd like the thread analyzed starting at the first page, please link to the first page of the thread; otherwise,",
      "provide a link to the first page you want scraped.")
print("\nIf the link that you provide is valid but not working, try using the .nl/.pl domains, as",
      "Cloudflare might be blocking your request.")
#browser-style user agent, reused for every request; Cloudflare may block requests without one
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}
while True:  #make sure the provided link is valid by checking for a thread title
    try:
        threadLink = input("\n: ")
        print("\nFetching thread...")
        pageData = requests.get(threadLink, headers=headers)
        pageData = BeautifulSoup(pageData.text, "lxml")
        threadTitle = pageData.find("h1", class_="p-title-value").text
        if pageData.find("span", class_="label") is not None:  #remove the label if it exists
            threadTitle = " ".join(threadTitle.split("\xa0")[1:])
        break
    except Exception:
        print("Error: There was either trouble reaching the webpage, or the provided link is invalid.\nTry again.")
TLD = getTLD(threadLink)
print("------------------------------------------\nThread:", threadTitle)
print("------------------------------------------")
#check whether the user would like to stop at a certain page
ui = yesno("Would you like to stop at a certain page (y/n)?: ")
if ui:
    stopPage = input("What page do you want to stop at?: ")
    while not stopPage.isdigit():
        print("Input must be a number. Try again.")
        stopPage = input("What page do you want to stop at?: ")
    stopPage = int(stopPage)
#collect user preference for filtering
#collect the filter type
print("""------------------------------------------
How would you like the thread to be filtered?
(1) positive ratings
(2) negative ratings
(3) neutral ratings
(4) total ratings
(5) specific rating
(6) weighted score (positive ratings count as positive points, negative ratings count as negative points, and neutral ratings don't count)
""")
filterSelection = Filter(collectInput(6))
if filterSelection == Filter.specific:  #if filtering by a specific rating, ask which rating to use
    print("""------------------------------------------
Which rating do you want filtered?
(1) Like                 (2) Dislike
(3) Agree                (4) Disagree
(5) Winner               (6) Informative
(7) Thunk-Provoking      (8) Feels
(9) Islamic Content      (10) Lunacy
(11) Autistic            (12) Horrifying
(13) Optimistic          (14) TMI
(15) Late                (16) Dumb
(17) Mad at the Internet (18) Semper Fidelis
(19) Deviant             (20) Achievement
""")
    choice = collectInput(20)
    RS_type = ["Like", "Dislike", "Agree", "Disagree", "Winner", "Informative", "Thunk-Provoking", "Feels", "Islamic Content", "Lunacy", "Autistic",
               "Horrifying", "Optimistic", "TMI", "Late", "Dumb", "Mad at the Internet", "Semper Fidelis", "Deviant", "Achievement"][choice - 1]
#collect the ratings threshold
print("------------------------------------------")
if filterSelection == Filter.weighted:
    print("Enter a minimum score for posts.")
elif filterSelection == Filter.specific:
    print(f"Enter a minimum number of '{RS_type}' ratings.")
else:
    print(f"Enter a minimum number of {filterSelection.name} ratings.")
minRating = input(": ")
while not minRating.isdigit():
    print("Error: Input must be a number.")
    minRating = input(": ")
minRating = int(minRating)
#get the current page number
pageNum = int(pageData.find("li", class_="pageNav-page--current").find("a").text)
#scrape for posts
#as long as a next-page button exists, the scraper keeps searching through pages
print("\nGrab some popcorn, this might take a while...\n")
ctr = 0
while True:
    if terminate:
        break
    posts = pageData.find_all("article", class_="message")
    for post in posts:
        ctr += 1
        #print(ctr) #I use this for debugging, just ignore it.
        indivPost = PostData()
        indivPost.reactionData(post, TLD)
        if (
            (filterSelection == Filter.positive and indivPost.positive >= minRating)
            or (filterSelection == Filter.negative and indivPost.negative >= minRating)
            or (filterSelection == Filter.neutral and indivPost.neutral >= minRating)
            or (filterSelection == Filter.total and indivPost.totalScore >= minRating)
            or (filterSelection == Filter.weighted and indivPost.weightedScore >= minRating)
            or (filterSelection == Filter.specific and indivPost.ratings[RS_type] >= minRating)
        ):
            indivPost.takeInfo(post, TLD)
            savedPosts.append(indivPost)
            printPost(indivPost)
    if pageNum == stopPage:  #stop scraping when the user's stop page is reached
        break
    elif pageData.find("a", class_="pageNav-jump--next") is not None:  #otherwise stop scraping only when the program can't progress to another page
        while True:
            if terminate:
                break
            try:
                pageData = requests.get(f"http://kiwifarms{TLD}" + pageData.find("a", class_="pageNav-jump--next")["href"], headers=headers)
                pageData = BeautifulSoup(pageData.text, "lxml")
                break
            except Exception:  #this runs when a connection can't be made to the next page
                print(f"""
Error: Connection timed out while trying to connect to page {pageData.find("a", class_="pageNav-jump--next")["href"].split("-")[-1]}.
What would you like to do?
(1) retry (please wait a few minutes before you do this)
(2) save what you have already
(3) quit
""")
                choice = collectInput(3)
                if choice == 1:
                    continue
                elif choice == 2:
                    terminate = True
                else:
                    quit()
        pageNum += 1
    else:
        break
print(f"\nScraping finished.\nPosts found: {len(savedPosts)}")
#ask the user where to save the json file and what to name it
print("------------------------------------------")
createJson(savedPosts)