Some Kiwi Farms analytics using Common Lisp!
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

494 lines
19 KiB

  1. ;;;; Some Kiwi Farms analytics using Common Lisp!
  2. ;;;;
  3. ;;;; (more to come here. Also for the markdown readme.)
  4. ;;; Load the necessary libraries (maybe packagedef at some point?)
  5. ;;; Also, might want to quickload these with quicklisp, in case they
  6. ;;; aren't there
  7. ;;; Extra types for CLOS
  8. (require :trivial-types)
  9. (use-package :trivial-types)
  10. ;;; Net/HTML parsing
  11. (require 'dexador)
  12. (require 'lquery)
  13. ;;; Regular expressions
  14. (require 'cl-ppcre)
  15. ;;; Pretty pie charts etc with adw-charting + Vecto backend
  16. ;;; (seems like it will be useful for metrics)
  17. (require 'adw-charting)
  18. (require 'vecto)
  19. (asdf:oos 'asdf:load-op 'adw-charting-vecto)
  20. ;;; A class to represent a Kiwi Farms post.
  21. (defclass post ()
  22. ((author :reader get-post-author
  23. :initarg :author
  24. :type string
  25. :documentation "The author of this post.")
  26. (body :reader get-post-body
  27. :initarg :body
  28. :type string
  29. :documentation "The HTML body of this post.")
  30. (edit-time :reader get-post-edit-time
  31. :initarg :edit-time
  32. :type string
  33. :documentation "The time this post was edited (NIL if never).")
  34. (global-number :reader get-post-global-number
  35. :initarg :global-number
  36. :type integer
  37. :documentation "The post number (with respect to the forum).")
  38. (highlightp :reader get-post-highlightp
  39. :initarg :highlightp
  40. :type boolean
  41. :documentation "True (T) if this post is a highlight. (T/NIL).")
  42. (number :reader get-post-number
  43. :initarg :number
  44. :type integer
  45. :documentation "The post number (with respect to the thread).")
  46. (ratings :reader get-post-ratings
  47. :initarg :ratings
  48. :type property-list
  49. :documentation "This post's ratings, as a plist.")
  50. (time :reader get-post-time
  51. :initarg :time
  52. :type string
  53. :documentation "The time this post was made.")
  54. (url :reader get-post-url
  55. :initarg :url
  56. :type string
  57. :documentation "The URL for this post."))
  58. (:documentation "A post in a Kiwi Farms thread."))
  59. (defmethod print-object ((obj post) stream)
  60. "Pretty printing for post objects in the REPL."
  61. (print-unreadable-object (obj stream :type t)
  62. (format stream
  63. "#~4,'0d: ~a, ~a"
  64. (get-post-number obj)
  65. (get-post-author obj)
  66. (get-post-time obj))))
  67. (defmethod print-post-listing ((obj post) &optional bodyp)
  68. "Print a pretty post listing. If BODYP, the post body is also printed."
  69. (format t
  70. (concatenate 'string
  71. "~a~%#~4,'0d (#~7,'0d): ~a~%"
  72. "Posted: ~a~%Edited: ~a~%Ratings: ~a~%"
  73. "Highlight: ~a~%")
  74. (get-post-url obj)
  75. (get-post-number obj)
  76. (get-post-global-number obj)
  77. (get-post-author obj)
  78. (get-post-time obj)
  79. (get-post-edit-time obj)
  80. (get-post-ratings obj)
  81. (get-post-highlightp obj))
  82. (when bodyp
  83. (format t "~a~%" (get-post-body obj)))
  84. (format t "~%"))
  85. ;;; TODO: possibly want a 'to-serialized' method that prints a readable sexpr
  86. ;;; form of the posts so we can potentially store them and not have to
  87. ;;; spend so much time re-downloading them each time?
  88. ;;; (cont.) if so, then this function should be 'build-post-from-dom',
  89. ;;; and have another function 'build-post-from-serialized'.
  90. (defun build-post (dom-element)
  91. "Build a post object given the (plump) DOM article element for the post."
  92. (make-instance 'post
  93. :author (extract-author dom-element)
  94. :body (extract-body dom-element)
  95. :edit-time (extract-edit-time dom-element)
  96. :highlightp (extract-highlightp dom-element)
  97. :global-number (extract-global-number dom-element)
  98. :number (extract-number dom-element)
  99. :ratings (extract-ratings dom-element)
  100. :time (extract-time dom-element)
  101. :url (extract-url dom-element)))
  102. (defun extract-author (dom-element)
  103. "Extract the post author text from HTML/DOM."
  104. (elt (lquery:$ dom-element (attr :data-author)) 0))
  105. (defun extract-body (dom-element)
  106. "Extract the post body from HTML/DOM."
  107. (elt (lquery:$ dom-element "div article div" (html)) 0))
  108. (defun extract-edit-time (dom-element)
  109. "Extract the post's edit time from HTML/DOM. (Returns NIL if not edited.)"
  110. (if (> (length (lquery:$ dom-element "time" (attr :title))) 1)
  111. (elt (lquery:$ dom-element "time" (attr :title)) 1)
  112. nil))
  113. (defun extract-highlightp (dom-element)
  114. "Extract this post's highlight status from HTML/DOM. (T/NIL.)"
  115. (> (length (lquery:$ dom-element "header ul li span" (text))) 0))
  116. (defun extract-global-number (dom-element)
  117. "Extract the forum-global post number from HTML/DOM."
  118. (parse-integer (elt (lquery:$ dom-element (attr :data-content)) 0)
  119. :start (length "post-")))
  120. (defun extract-number (dom-element)
  121. "Extract the post number from HTML/DOM."
  122. ;; TODO: Need to fix this up! The highlight specifier messes with it.
  123. (parse-integer
  124. (cl-ppcre:scan-to-strings "(\\d+)"
  125. (reduce (lambda (s1 s2)
  126. (concatenate 'string s1 s2))
  127. (lquery:$ dom-element "li a" (text))))))
  128. (defun extract-ratings (dom-element)
  129. "Extract the post ratings from HTML/DOM. (Requires another HTTP call.)"
  130. (let* ((url (format nil
  131. "~a~d~a"
  132. "https://kiwifarms.net/posts/"
  133. (extract-global-number dom-element)
  134. "/reactions"))
  135. (page (progn (format t "Downloading: ~a~%" url)
  136. (dex:get url)))
  137. (plump (lquery:$ (initialize page)))
  138. (ratings (lquery:$ plump "bdi" (parent)))
  139. (ratings-plist nil))
  140. ;; We ignore the first element of ratings (for now?); it's the 'All' count
  141. (loop
  142. for index from 1 below (length ratings)
  143. for rating = (elt ratings index)
  144. for rating-label = (elt (lquery:$ rating "bdi" (text)) 0)
  145. for rating-count = (parse-integer
  146. (cl-ppcre:scan-to-strings
  147. "(\\d+)"
  148. (elt (lquery:$ rating (text)) 0)))
  149. do (setf (getf ratings-plist (intern (string rating-label) :keyword))
  150. rating-count))
  151. ratings-plist))
  152. (defun extract-time (dom-element)
  153. "Extract the post's write time from HTML/DOM."
  154. (elt (lquery:$ dom-element "time" (attr :title)) 0))
  155. ;;; TODO: fix this so it doesn't depend on the thread url?
  156. ;;; (is that info available in the given HTML?)
  157. (defun extract-url (dom-element)
  158. "Extract the post's URL from HTML/DOM."
  159. (concatenate 'string
  160. *thread-url*
  161. (elt (lquery:$ dom-element (attr :data-content)) 0)))
  162. ;;; Script stuff (maybe want this in a separate file/it's own function?)
  163. ;;; The URL for the Kiwi Farms thread (e.g.)
  164. ;;; (read this in from ARGV serialization at some point? TODO.)
  165. (defparameter *thread-url* "https://kiwifarms.net/threads/sam-fennah-failsafe42.64413/")
  166. ;;; First things first! User passes in a thread url, we want to get all
  167. ;;; pages as URL strings in a list, probably. (maybe refer to the sitemap for
  168. ;;; this?)
  169. ;;; Alternatively, maybe the user specifies these? (so that we don't rip entire
  170. ;;; threads but only specified thread pages, seems more manageable.)
  171. ;;;Also, echo everything as we go!
  172. (defparameter *start-page* 1)
  173. ;;;upto whatever page
  174. (defparameter *end-page* 30)
  175. ;;; Collect the specified thread posts into a list
  176. (defparameter *collected-posts*
  177. (loop
  178. for page-index from *start-page* to *end-page*
  179. for url = (format nil "~Apage-~D" *thread-url* page-index)
  180. for post = (progn (format t "Downloading: ~a~%" url)
  181. (dex:get url))
  182. for plump = (lquery:$ (initialize post))
  183. append (loop
  184. for article across (lquery:$ plump "article[data-author]")
  185. collect (build-post article))))
  186. ;;; Can print them all to check
  187. (mapcar #'print-post-listing *collected-posts*)
  188. ;;; Make a backup copy in case I bork the current one
  189. (defparameter *backup-posts-list* (copy-seq *collected-posts*))
  190. ;;; For interest: how much memory are we using with all this?
  191. (room)
  192. ;;; Yeah, probably want to get serialization or something sorted
  193. ;;; before I point this at one of the 2000+ page threads.
  194. ;;; Useful stuff we want to be able to do!
  195. ;;; e.g. sort posts by most Informative
  196. (defun get-rating-count (post rating)
  197. "Return the number of RATING ratings that the POST received."
  198. (let* ((ratings (get-post-ratings post))
  199. (rating-count (getf ratings (intern (string rating) :keyword))))
  200. (unless rating-count
  201. (setf rating-count 0))
  202. rating-count))
  203. (defun sort-posts-by-rating (predicate rating)
  204. "Sort the list of posts by RATING according to the given PREDICATE."
  205. (setf *collected-posts*
  206. (stable-sort *collected-posts*
  207. predicate
  208. :key (lambda (post)
  209. (get-rating-count post rating)))))
  210. (defun sort-posts-by-most (rating)
  211. "Sort the list of posts in descending order of RATING."
  212. (sort-posts-by-rating #'> rating))
  213. (defun sort-posts-by-least (rating)
  214. "Sort the list of posts in ascending order of RATING."
  215. (sort-posts-by-rating #'< rating))
  216. (sort-posts-by-most "Informative")
  217. (mapcar #'print-post-listing *collected-posts*)
  218. ;;; For time comparisons, we convert the string timestamp into
  219. ;;; CL Universal time. This will be useful for plotting too, I imagine.
  220. (defun timestamp-to-universal-time (timestamp)
  221. "Convert a given post TIMESTAMP to CL universal time."
  222. ;; First, we have some helper functions to parse the timestamp properly.
  223. (flet ((12hour-to-24hour (hours pm-p)
  224. "Convert the given 12-hour time to 24-hour time."
  225. (cond ((and (= hours 12) (not pm-p)) 0)
  226. ((and (/= hours 12) (not pm-p)) hours)
  227. ((and (= hours 12) pm-p) hours)
  228. (t (+ 12 hours))))
  229. (month-string-to-number (month)
  230. "Converts a month string (e.g. 'Dec') to a number (e.g. 12)"
  231. (let ((months '("Jan" "Feb" "Mar" "Apr" "May" "Jun"
  232. "Jul" "Aug" "Sep" "Oct" "Nov" "Dec")))
  233. (1+ (position month months :test #'string=)))))
  234. (let* ((parts (cl-ppcre:split "(\\s+)|(,)|(:)" timestamp))
  235. (seconds 0) ; Do we care about seconds?
  236. (minutes (parse-integer (elt parts 6)))
  237. (hours (12hour-to-24hour
  238. (parse-integer (elt parts 5))
  239. (string= "PM" (elt parts 7))))
  240. (day (parse-integer (elt parts 1)))
  241. (month (month-string-to-number (elt parts 0)))
  242. (year (parse-integer (elt parts 3))))
  243. (encode-universal-time seconds minutes hours day month year))))
  244. ;;; And back the other way!
  245. (defun universal-time-to-timestamp (time)
  246. "Convert the given CL universal TIME into a formatted timestamp."
  247. (flet ((24hour-to-12hour (hours)
  248. "Convert the given 24-hour time to a 12-hour time."
  249. (let ((pm-p (>= hours 12)))
  250. (values (cond ((= hours 0) 12)
  251. ((not pm-p) hours)
  252. ((= hours 12) hours)
  253. (t (- hours 12)))
  254. pm-p)))
  255. (month-number-to-string (month)
  256. "Converts a month number (e.g. 12) to a string (e.g. 'Dec')"
  257. (let ((months '("Jan" "Feb" "Mar" "Apr" "May" "Jun"
  258. "Jul" "Aug" "Sep" "Oct" "Nov" "Dec")))
  259. (elt months (1- month)))))
  260. (multiple-value-bind (seconds minutes 24hours day month year)
  261. (decode-universal-time time)
  262. (declare (ignore seconds)) ; We still don't care about seconds?
  263. (multiple-value-bind (12hours pm-p)
  264. (24hour-to-12hour 24hours)
  265. (format nil
  266. "~a ~d, ~d at ~d:~2,'0d ~a"
  267. (month-number-to-string month)
  268. day
  269. year
  270. 12hours
  271. minutes
  272. (if pm-p "PM" "AM"))))))
  273. ;;; TODO: These 'sort-posts-by' functions are all starting to have
  274. ;;; a noticeable pattern to them. Time to bring out the macros?
  275. (defun sort-posts-by-time (predicate)
  276. "Sort the list of posts by time, according to the given PREDICATE."
  277. (setf *collected-posts*
  278. (stable-sort *collected-posts*
  279. predicate
  280. :key (lambda (post)
  281. (timestamp-to-universal-time
  282. (get-post-time post))))))
  283. (defun sort-posts-by-newest ()
  284. "Sort the list of posts so that the newest posts appear first in the list."
  285. (sort-posts-by-time #'>))
  286. (defun sort-posts-by-oldest ()
  287. "Sort the list of posts so that the oldest posts appear first in the list."
  288. (sort-posts-by-time #'<))
  289. (sort-posts-by-newest)
  290. (mapcar #'print-post-listing *collected-posts*)
  291. ;;; More useful for this time comparison funcationality is filtering!
  292. ;;; TODO: probably want some more sophisticated timestamp parsing here.
  293. (defun filter-posts-between (start-time end-time)
  294. "Return only those posts occurring between START-TIME and END-TIME."
  295. (let ((start-universal-time (timestamp-to-universal-time start-time))
  296. (end-universal-time (timestamp-to-universal-time end-time)))
  297. (remove-if-not
  298. (lambda (post)
  299. (let ((post-universal-time (timestamp-to-universal-time
  300. (get-post-time post))))
  301. (and (>= post-universal-time start-universal-time)
  302. (<= post-universal-time end-universal-time))))
  303. *collected-posts*)))
  304. ;;; Filter only the 'Highlight' posts?
  305. (defun filter-posts-highlight ()
  306. "Return only those posts designated as 'Highlight' posts."
  307. (remove-if-not #'get-post-highlightp *collected-posts*))
  308. ;;; e.g. filter only the Informative posts?
  309. (defun filter-posts-by-rating (rating &optional (greater-than 0))
  310. "Return only those posts with the given amount of RATINGs."
  311. (remove-if-not (lambda (post)
  312. (let ((rating (getf (get-post-ratings post)
  313. (intern (string rating) :keyword))))
  314. (> (if rating
  315. rating
  316. 0)
  317. greater-than)))
  318. *collected-posts*))
  319. ;;; e.g.
  320. (filter-posts-between "Dec 1, 2018 at 7:00 PM"
  321. "Dec 1, 2019 at 7:00 PM")
  322. (filter-posts-highlight)
  323. ;;; Now for a pretty graph! First, we keep track of the post authors and
  324. ;;; the number of times they've posted in a plist
  325. (defun graph-piechart-posts-by-author (posts)
  326. "Draw a pretty pie-chart of the percentage of POSTS by author."
  327. (let ((number-of-posts nil))
  328. (loop
  329. for post in posts
  330. for post-author = (intern (string (get-post-author post)) :keyword)
  331. do
  332. ;; TODO: This bugs me to look at: too much repeated (getf ..) !
  333. ;; Not sure what to do about it though. setf/incf need their
  334. ;; argument to be 'getf-able', so a variable/let binding
  335. ;; doesn't work. Well, whatever.
  336. (if (getf number-of-posts post-author)
  337. (incf (getf number-of-posts post-author))
  338. (setf (getf number-of-posts post-author) 1)))
  339. ;; Make a pretty pie chart!
  340. (net.acceleration.charting:with-chart (:pie 600 370)
  341. (loop for (label value) in
  342. (sort
  343. (loop for (author post-count) on number-of-posts by #'cddr
  344. collect
  345. (let ((post-percentage
  346. (float (* 100 (/ post-count
  347. (length posts))))))
  348. (list (format nil
  349. "~a ~5,2f\%"
  350. (string author)
  351. post-percentage)
  352. post-count)))
  353. #'>
  354. :key (lambda (pair) (cadr pair)))
  355. do
  356. (net.acceleration.charting:add-slice label value))
  357. (net.acceleration.charting:save-file "piechart.png"))))
  358. ;;; More list sorting/analytics for fun
  359. (let ((rating "Informative"))
  360. (mapcar (lambda (post)
  361. (format t
  362. "[URL=~a]Post ~4,'0d[/URL], [COLOR=rgb(84, 172, 210)]~a: ~a=~d[/COLOR]~%"
  363. (get-post-url post)
  364. (get-post-number post)
  365. (get-post-author post)
  366. rating
  367. (getf (get-post-ratings post)
  368. (intern (string rating) :keyword))))
  369. (subseq (sort-posts-by-most rating) 0 10)))
  370. ;;; More informative: make a graph of binned-plot frequency!
  371. ;;; (bin according to days to start with.)
  372. (defun day-start (timestamp)
  373. "Return a new timestamp for the given day, but at 12:00AM."
  374. (let ((parts (cl-ppcre:split "(\\s+)|(,)|(:)" timestamp)))
  375. (format nil
  376. "~a ~a, ~a at 12:00 AM"
  377. (elt parts 0)
  378. (elt parts 1)
  379. (elt parts 3))))
  380. (defun increment-day (universal-time)
  381. "Increments the day for the given UNIVERSAL-TIME."
  382. (+ universal-time (* 60 60 24)))
  383. (sort-posts-by-oldest)
  384. (defparameter *oldest-post* (first *collected-posts*))
  385. (defparameter *oldest-day* (day-start (get-post-time *oldest-post*)))
  386. (defparameter *newest-post* (car (last *collected-posts*)))
  387. (defparameter *newest-day-plus-1*
  388. (day-start (universal-time-to-timestamp
  389. (increment-day (timestamp-to-universal-time
  390. (get-post-time *newest-post*))))))
  391. ;;; I don't really like the line plotting here; the x-axis ticks don't
  392. ;;; line up with the data and there's no control over the viewing pane
  393. ;;; (Workaround: just output these into a csv or something that you
  394. ;;; can load into a spreadsheet to play with?)
  395. (defparameter *binned-frequency*
  396. (loop
  397. for time = (timestamp-to-universal-time *oldest-day*)
  398. then (increment-day time)
  399. until (>= time (timestamp-to-universal-time *newest-day-plus-1*))
  400. collect
  401. (list time
  402. (length (filter-posts-between
  403. (universal-time-to-timestamp time)
  404. (universal-time-to-timestamp (increment-day time)))))))
  405. (with-open-file (stream "freq.csv" :direction :output :if-exists :overwrite)
  406. ;; Header
  407. (format stream "~a,~a,~a~%" "universal time" "day timestamp" "post count")
  408. (loop for post-count-pair in *binned-frequency* do
  409. (format stream
  410. "~d,\"~a\",~d~%"
  411. (car post-count-pair)
  412. (universal-time-to-timestamp (car post-count-pair))
  413. (cadr post-count-pair))))
  414. (net.acceleration.charting:with-chart (:line 700 370)
  415. (net.acceleration.charting:add-series
  416. nil
  417. *binned-frequency*
  418. (net.acceleration.charting:set-axis
  419. :x "time"
  420. :label-formatter (lambda (time)
  421. (multiple-value-bind
  422. (seconds minutes hours day month)
  423. (decode-universal-time time)
  424. (declare (ignore seconds minutes hours))
  425. (format nil
  426. "~2,'0d/~2,'0d"
  427. month
  428. day))))
  429. (net.acceleration.charting:set-axis :y "posts per day")
  430. (net.acceleration.charting:save-file "freq_plot.png")))