Browse Source

added functionality to filter by a specific user

master
= 1 month ago
parent
commit
610c9e986c
9 changed files with 58 additions and 18 deletions
  1. +3
    -1
      README.md
  2. +8
    -3
      scraper/collectPosts.py
  3. BIN
      scraper/createOutput/__pycache__/__init__.cpython-37.pyc
  4. BIN
      scraper/createOutput/__pycache__/jsonConvert.cpython-37.pyc
  5. BIN
      scraper/createOutput/__pycache__/outputPrep.cpython-37.pyc
  6. BIN
      scraper/createOutput/__pycache__/textConvert.cpython-37.pyc
  7. +17
    -1
      scraper/funcs.py
  8. +20
    -8
      scraper/postData.py
  9. +10
    -5
      scraper/scraper.py

+ 3
- 1
README.md View File

@@ -1,6 +1,6 @@
# Kiwi Scraper

Python-based webscraper designed to scrape kiwifarms.net threads and filter them according to post ratings.
Python-based webscraper designed to scrape kiwifarms.net threads and filter them according to post ratings or user.

This program allows users to filter by:
- positive ratings
@@ -9,6 +9,7 @@ This program allows users to filter by:
- overall ratings
- specific rating
- weighted score (positive ratings have positive values, negative ratings have negative values)
- specific user

The program outputs its findings via either JSON, text file, or plain text.

@@ -58,6 +59,7 @@ How would you like the thread to be filtered?
(4) total ratings
(5) specific rating
(6) weighted score (positive ratings count as positive points, negative ratings count as negative points, and neutral ratings don't count)
(7) specific user

: 4
------------------------------------------


+ 8
- 3
scraper/collectPosts.py View File

@@ -26,14 +26,15 @@ Total Reactions: {post.totalScore} Weighted: {post.weightedScore} Positive: {p
reactionsCtr = reactionsCtr + reaction + ": " + str(num) + " "
print(reactionsCtr)

def collectPosts(pageData, filter, minRating, pageNum, stopPage = 0, ratingFilter = ""):
def collectPosts(pageData, filter, pageNum, minRating = 0, user = "", stopPage = 0, ratingFilter = ""):
#scrapes for posts that pass the specified filter
#keeps going until the last page in thread (when there is no "next" button to click) or stopPage reached
#
# pageData = BeautifulSoup object representing first page to search
# filter = filter being used (from Filter class in scraper.py)
# minRating = minimum rating needed to pass filter
# pageNum = current page in thread
# minRating = minimum rating needed to pass filter if filtering by ratings
# user = user to filter by if filtering by user
# stopPage = page to stop at; 0 if none specified
# ratingFilter = rating to filter by if filtering by specific rating
#
@@ -54,7 +55,10 @@ def collectPosts(pageData, filter, minRating, pageNum, stopPage = 0, ratingFilte
ctr += 1
#print(ctr) #I use this for debugging, just ignore it.
indivPost = PostData()
indivPost.reactionData(post) #collect reactions
#collect reactions if filtering by reactions
if not filter.value == 7: indivPost.reactionData(post)
#collect username if filtering by user
else: indivPost.getUsername(post)

if (#check if post passes filter
(filter.value == 1 and indivPost.positive >= minRating) #positive
@@ -63,6 +67,7 @@ def collectPosts(pageData, filter, minRating, pageNum, stopPage = 0, ratingFilte
or (filter.value == 4 and indivPost.totalScore >= minRating) #total ratings
or (filter.value == 6 and indivPost.weightedScore >= minRating) #weighted score
or (filter.value == 5 and indivPost.ratings[ratingFilter] >= minRating) #specific rating
or (filter.value == 7 and indivPost.poster == user) #user
):#if filter passed, save rest of post's data, add to savedPosts, and print that the post was found
indivPost.takeInfo(post)
savedPosts.append(indivPost)


BIN
scraper/createOutput/__pycache__/__init__.cpython-37.pyc View File


BIN
scraper/createOutput/__pycache__/jsonConvert.cpython-37.pyc View File


BIN
scraper/createOutput/__pycache__/outputPrep.cpython-37.pyc View File


BIN
scraper/createOutput/__pycache__/textConvert.cpython-37.pyc View File


+ 17
- 1
scraper/funcs.py View File

@@ -2,6 +2,8 @@
from bs4 import BeautifulSoup
import requests

############# locally used functions ###########################

def collectInput(range):
#collects user input for multiple choice questions, validates it, and returns response
#
@@ -26,6 +28,8 @@ def yesno(question):
else:
return False

##################### functions used outside file ########################

def getThread():
#creates BeautifulSoup object from threadLink and retrieves thread's title and URL
# returns BS object, thread title, and link to thread
@@ -80,8 +84,9 @@ How would you like the thread to be filtered?
(4) total ratings
(5) specific rating
(6) weighted score (positive ratings count as positive points, negative ratings count as negative points, and neutral ratings don't count)
(7) specific user
""")
return collectInput(6)
return collectInput(7)


def reactionSelect():
@@ -108,6 +113,17 @@ Which rating do you want filtered?
"Horrifying", "Optimistic", "TMI", "Late", "Dumb", "Mad at the Internet", "Semper Fidelis", "Deviant", "Achievement", "DRINK!"][choice - 1]


def getUsername():
    #collects the username to filter by from user input and confirms it
    #
    # returns the confirmed username as a string
    #
    #loop instead of recursion so repeated "no" answers cannot grow the call stack
    while True:
        user = input("Enter username to filter by: ")#user input for name to filter by
        #verify choice with user before accepting it
        if yesno(f"User '{user}' selected. Is this correct? (Make sure you spelled it correctly!)\n: "):
            return user


def getRatingsThreshold(filter, reaction = ""):
#collects ratings threshold based on filter value
#


+ 20
- 8
scraper/postData.py View File

@@ -82,9 +82,10 @@ def getRatings(post):
class PostData:
#stores data about posts
reactionsEvaluated = False #true when the post's reactions have already been evaluated

def __init__(self):
self.reactionsEvaluated = False #true when the post's reactions have already been evaluated
self.usernameRetrieved = False #true when the poster's name has already been retrieved

#metadata
self.postLink = "" #link to post
self.postNum = 0 #post's number in thread
@@ -161,6 +162,21 @@ class PostData:
reactionsEvaluated = True
return {"positive" : self.positive, "neutral" : self.neutral, "negative" : self.negative, "weighted score" : self.weightedScore, "total score" : self.totalScore}

def getUsername(self, infoBS):
    #retrieves the username of the poster and saves it to self.poster
    #returns username retrieved from post
    #
    # infoBS = BeautifulSoup object representing the HTML for a post
    #
    #useful for retrieving username of poster without having to save the rest of the post's data
    try:
        self.poster = infoBS.find("span", class_ = "username").text
    except AttributeError:#if user is a guest, username data is stored in a different tag
        self.poster = infoBS.find("a", class_ = "username").text
    #mark the flag on the instance: the old `global usernameRetrieved` wrote a
    #module-level name, so the per-instance check in takeInfo never saw it and
    #re-fetched the username a second time (mirrors the reactionsEvaluated fix)
    self.usernameRetrieved = True
    return self.poster


def takeInfo(self, infoBS):
#intakes BeautifulSoup object representing the HTML for a post
#and sorts its data into the proper variables
@@ -168,10 +184,7 @@ class PostData:
#get metadata
self.postLink = f"http://kiwifarms.net" + infoBS.find("a", class_ = "u-concealed")["href"]
self.postNum = int("".join(re.split("\n|\t", infoBS.find("ul", class_ = "message-attribution-opposite--list").find_all("li")[-1].find("a").text)).split("#")[1].replace(',',''))
try:
self.poster = infoBS.find("span", class_ = "username").text
except AttributeError:#if user is a guest, username data is stored in a different tag
self.poster = infoBS.find("a", class_ = "username").text
if not self.usernameRetrieved: self.getUsername(infoBS)
self.postDate = infoBS.find("time", class_ = "u-dt")["data-date-string"]
try:
self.editDate = infoBS.find("div", class_ = "message-lastEdit")
@@ -184,6 +197,5 @@ class PostData:
self.content = getContent(infoBS)
self.attachedMedia = getAttachments(infoBS)
#get rating data
global reactionsEvaluated
if not reactionsEvaluated:
if not self.reactionsEvaluated:
self.reactionData(infoBS)

+ 10
- 5
scraper/scraper.py View File

@@ -6,7 +6,7 @@ from bs4 import BeautifulSoup
import enum
from postData import PostData
from prepOutput import outputSelect
from funcs import getThread, stop_page, getFilter, reactionSelect, getRatingsThreshold
from funcs import getThread, stop_page, getFilter, reactionSelect, getRatingsThreshold, getUsername
from collectPosts import collectPosts

class Filter(enum.Enum):#enumeration for type of filter
@@ -17,6 +17,7 @@ class Filter(enum.Enum):#enumeration for type of filter
total = 4 #total ratings
specific = 5 #specific rating
weighted = 6 #weighted rating
user = 7 #specific user
###### Variables ######
#thread data
@@ -25,6 +26,7 @@ threadTitle = ""#title of thread
pageData = ""#BeautifulSoup object storing threadpage's HTML
savedPosts = [] #stores PostData objects for posts to be saved
pageNum = 0 #current page number
usernameFilter = "" #username to filter by when applicable

#filter toggles
filterSelection = Filter.NA #type of filter being used
@@ -54,12 +56,15 @@ stopPage = stop_page()
filterSelection = Filter(getFilter())

if filterSelection == Filter.specific:#if filtering by specific rating ask which rating to use
ratingSelection = reactionSelect()
ratingSelection = reactionSelect()

#colect ratings threshold
#if filtering by specific user, collect username
if filterSelection == Filter.user: usernameFilter = getUsername()

#collect ratings threshold if user is filtering by ratings
print("------------------------------------------")
if filterSelection == Filter.specific: minRating = getRatingsThreshold(filterSelection, ratingSelection)
else: minRating = getRatingsThreshold(filterSelection)
elif not filterSelection == Filter.user: minRating = getRatingsThreshold(filterSelection)

#get current page number
try: pageNum = int(pageData.find("li", class_ = "pageNav-page--current").find("a").text)
@@ -67,7 +72,7 @@ except: pageNum = 1

#scrape for posts
print("\nGrab some popcorn, this might take a while...\n")
savedPosts = collectPosts(pageData = pageData, filter = filterSelection, minRating = minRating, pageNum = pageNum, stopPage = stopPage, ratingFilter=ratingSelection)
savedPosts = collectPosts(pageData = pageData, filter = filterSelection, pageNum = pageNum, minRating = minRating, user = usernameFilter, stopPage = stopPage, ratingFilter=ratingSelection)

# ask user where to save json file and what to name it
print("------------------------------------------")


Loading…
Cancel
Save