#!/usr/bin/ruby

# A script to efficiently search through a JSON file of archives
# of a certain lolcow's exploits over the last two decades.
#
# Assuming the JSON file is saved as archives.json, run this script
# at the command line using
# ./scan_TrainDodger_archives.rb archives.json
#
# The script downloads and parses the HTML for all of the archives
# listed in the JSON file, and defines various functions for
# efficiently searching through the archives for interesting tidbits.
# The script finally instantiates an IRB REPL where the user has
# fine-grained control over further searching and filtering.
#
# Most useful is probably the keywords_search(archives, keywords)
# function, which peruses the archive HTML for a given set of keywords
# and returns a hash listing the archive URLs and the context for
# easy reading.
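#
# The JSON file is assumed (based on how extract_archives_from_json
# reads it below) to be an array of objects, each with an "archived"
# key holding the archive URL; any other fields are ignored. For example:
#
# [
#   {"archived": "https://archive.li/XXXXX", ...},
#   {"archived": "https://archive.li/YYYYY", ...}
# ]
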
require "irb"
|
|
require "json"
|
|
require "nokogiri"
|
|
require "open-uri"
|
|
|
|
|
|
# Download and return the parsed HTML document for the webpage at the
# given URL, or nil if the download fails.
#
# Usage:
# html = download_html_document_object("https://archive.li/XXXXX")
# #=> #<Nokogiri::HTML::Document ...>
def download_html_document_object(url)
  print "Downloading from #{url}... "
  STDOUT.flush
  begin
    html = Nokogiri::HTML.parse(URI.open(url))
  rescue OpenURI::HTTPError, SocketError, Net::OpenTimeout
    # Didn't download properly (e.g. HTTP error, timeout, bad read, etc.)
    puts "FAIL"
    return nil
  end
  puts "OK"
  return html
end

# Return an array of archive URL strings extracted from the
# given JSON_FILE.
#
# Usage:
# archives = extract_archives_from_json(json_file)
# archives #=> ["https://archive.li/XXXXX", ... ]
def extract_archives_from_json(json_file)
  json_string = File.read(json_file)
  JSON.parse(json_string).map { |archive| archive["archived"] }
end

# Return an array of all indices at which a given WORD appears in
# the given TEXT string. The search is case-insensitive.
#
# Usage:
# find_word_indices("she sells seashells", "She")
# #=> [0, 13]
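#
# The zero-width lookahead regex used below lets String#scan report
# overlapping occurrences as well, e.g.
# find_word_indices("banana", "ana") #=> [1, 3]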
def find_word_indices(text, word)
  # Escape the word so characters like "." or "?" are matched literally
  regex = Regexp.new("(?=" + Regexp.escape(word) + ")", Regexp::IGNORECASE)
  text.enum_for(:scan, regex).map { Regexp.last_match.offset(0).first }
end

# Return a 'context' string about the given INDEX in the supplied TEXT,
# i.e. return the substring delimited by
# [INDEX - CONTEXT_LIM, INDEX + CONTEXT_LIM].
# By default, CONTEXT_LIM = 50.
#
# Usage:
# get_context_string("This is a long string of text", 5, 7)
# #=> "This is a lon"
def get_context_string(text, index, context_lim=50)
  context_start = [0, index - context_lim].max
  context_end = [text.length - 1, index + context_lim].min
  text[context_start..context_end]
end

# Search through the given ARCHIVES (a hash mapping archive URLs to
# parsed HTML documents, like archive_html below) for each of the given
# KEYWORDS, and return a hash mapping each keyword to a list of the
# archives the keyword was found in (if any) and their surrounding
# contexts.
#
# Usage:
# keywords_search(archive_html, ["keyword"])
# #=> {"keyword"=>
# #     [{"http://archive.li/XXXXX"=>
# #        ["The keyword appeared in this sentence"]}]}
def keywords_search(archives, keywords)
  # For efficiency, we loop through the archives and collect all
  # of the keywords+contexts for each archive URL at once. But this
  # isn't a nice way to structure the results, so we invert the
  # data structure afterward.
  archive_keywords = { }
  archives.each do |url, html|
    next if html.nil? # skip archives that failed to download
    archive_keywords[url] = [ ]
    text = html.text
    keywords.each do |keyword|
      indices = find_word_indices(text, keyword)
      unless indices.empty?
        contexts = indices.map { |index| get_context_string(text, index) }
        archive_keywords[url] << {keyword => contexts}
      end
    end
  end

  # Invert the hash data structure so that we match keywords to
  # archives, instead of the other way around.
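  # That is, archive_keywords looks like
  #   {"http://archive.li/XXXXX"=>[{"keyword"=>["...context..."]}, ...], ...}
  # and we want to return
  #   {"keyword"=>[{"http://archive.li/XXXXX"=>["...context..."]}, ...], ...}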
  results = { }
  archive_keywords.each do |url, keyword_hits|
    keyword_hits.each do |hit|
      keyword = hit.keys[0]
      results[keyword] ||= [ ]
      results[keyword] << {url => hit.values[0]}
    end
  end
  results
end

# Print the given SEARCH_RESULTS (as returned by keywords_search) in a
# neat, pretty form where each keyword is followed by a list of archive
# URLs and the surrounding contexts in which the keyword appears.
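#
# Usage (hypothetical keyword):
# pretty_print_search_results(keywords_search(archive_html, ["court"]))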
def pretty_print_search_results(search_results)
  search_results.each do |keyword, appearances|
    puts "#{keyword}:"
    appearances.each do |appearance|
      url = appearance.keys[0]
      contexts = appearance.values[0]
      puts "  #{url}:"
      contexts.each do |context|
        # Collapse runs of whitespace/newlines into single spaces
        puts "    #{context.gsub(/\s+/, " ")}"
      end
    end
  end
end

# Assume the JSON file is provided as the first command-line arg
if ARGV.length < 1
  raise(ArgumentError, "No JSON file provided.")
else
  json_file = ARGV[0]
  # And maybe do something with extra, optional command-line args?
  # (See the sketch below.)
end
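
# One possibility for the extra args (a sketch, not enabled): treat any
# remaining command-line args as an initial set of keywords, e.g.
#   initial_keywords = ARGV.drop(1)
# and then, after the download loop below, run
#   pretty_print_search_results(keywords_search(archive_html, initial_keywords)) unless initial_keywords.empty?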

archive_urls = extract_archives_from_json(json_file)
archive_html = { }

# Download HTML for all archives (check: is this sane? Might
# take too long/send too many requests.)
archive_urls = archive_urls[0..9] # DEBUG: only the first 10 while testing
archive_urls.each do |url|
  archive_html[url] = download_html_document_object(url)

  # Add in a 5s pause so that we don't trip archive.li's
  # "Are you a bot?" CAPTCHA
  sleep(5)
end

# Finally, instantiate an IRB REPL for interactive searching.
# Available variables: json_file, archive_urls, archive_html
# Available functions: download_html_document_object,
# extract_archives_from_json, find_word_indices, get_context_string,
# keywords_search, pretty_print_search_results
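#
# A typical session might look like (keywords here are just examples):
#   results = keywords_search(archive_html, ["lawsuit", "court"])
#   pretty_print_search_results(results)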
binding.irb