The Ruby script for searching and filtering the archives

Yotsubaaa · commit 8fe9fd9095 · master · 4 months ago
1 changed file with 176 additions and 0 deletions

scan_TrainDodger_archives.rb

@@ -0,0 +1,176 @@
#!/usr/bin/ruby

# A script to efficiently search through a JSON file of archives
# of a certain lolcow's exploits over the last two decades.
#
# Assuming the JSON file is saved as archives.json, run this script
# at the command line using
# ./scan_TrainDodger_archives.rb archives.json
#
# The script downloads and parses the HTML for all of the archives
# listed in the JSON file, and defines various functions for
# efficiently searching through the archives for interesting tidbits.
# The script finally instantiates an IRB REPL where the user has
# fine-grained control over further searching and filtering.
#
# Most useful is probably the keywords_search(archives, keywords)
# function, which combs the archive HTML for a given set of keywords
# and returns a hash listing the archive URLs and the surrounding
# context for easy reading.


require "irb"
require "json"
require "nokogiri"
require "open-uri"


# Download, parse, and return the HTML document for the webpage at
# the given URL, or nil if the download fails.
#
# Usage:
#   html = download_html_document_object("https://archive.li/XXXXX")
#   #=> #<Nokogiri::HTML::Document name="document" children=[...]>
def download_html_document_object(url)
  print "Downloading from #{url}... "
  STDOUT.flush
  begin
    html = Nokogiri::HTML.parse(URI.open(url))
  rescue OpenURI::HTTPError, SocketError, Net::OpenTimeout, Net::ReadTimeout
    # Didn't download properly (e.g. HTTP error, timeout, bad read)
    puts "FAIL"
    return nil
  end
  puts "OK"
  html
end
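

# An illustrative convenience wrapper (a sketch, not part of the
# original flow, which calls download_html_document_object directly):
# retry a flaky download up to MAX_TRIES times, pausing between
# attempts, and give up with nil if every attempt fails.
def download_html_with_retries(url, max_tries=3, pause=5)
  max_tries.times do
    html = download_html_document_object(url)
    return html unless html.nil?
    sleep(pause)
  end
  nil
end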


# Return an array of archive URL strings extracted from the
# given JSON_FILE.
#
# Usage:
# archives = extract_archives_from_json(json_file)
# archives #=> ["https://archive.li/XXXXX", ... ]
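#
# The JSON file is assumed (judging from the "archived" lookup in the
# body below) to be an array of objects that each carry an "archived"
# key, e.g.:
#   [{"archived": "https://archive.li/XXXXX", ...}, ...]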
def extract_archives_from_json(json_file)
  json_string = IO.read(json_file)
  JSON.parse(json_string).map { |archive| archive["archived"] }
end


# Return an array of all indices at which a given WORD appears in
# the given TEXT string. The search is case-insensitive.
#
# Usage:
# find_word_indices("she sells seashells", "She")
# #=> [0, 13]
def find_word_indices(text, word)
  # Zero-width lookahead so that overlapping occurrences are found
  # too; escape the word in case it contains regex metacharacters.
  regex = Regexp.new("(?=#{Regexp.escape(word)})", Regexp::IGNORECASE)
  text.enum_for(:scan, regex).map { Regexp.last_match.offset(0).first }
end


# Return a 'context' string about the given INDEX in the supplied TEXT.
# i.e. return the substring delimited by
# [INDEX - CONTEXT_LIM, INDEX + CONTEXT_LIM], clamped to the bounds
# of TEXT. By default, CONTEXT_LIM = 50.
#
# Usage:
# get_context_string("This is a long string of text", 5, 7)
# #=> "This is a lon"
def get_context_string(text, index, context_lim=50)
  context_start = [0, index - context_lim].max
  context_end = [text.length - 1, index + context_lim].min
  text[context_start..context_end]
end


# Search through the given ARCHIVES for each of the given
# KEYWORDS, and return a hash mapping each keyword to a list
# of the archives the keyword was found in (if any) and their
# surrounding contexts.
#
# Usage:
# keywords_search(archives, ["keyword"])
# #=> {"keyword"=>
# # {"http://archive.li/XXXXX"=>
# # ["The keyword appeared in this sentence"]}}
def keywords_search(archives, keywords)
  # For efficiency, we loop through the archives and collect all
  # of the keywords+contexts for each archive URL at once. But this
  # isn't a nice way to structure the results, so we invert the
  # data structure afterward.
  archive_keywords = { }
  archives.each do |url, html|
    next if html.nil? # Skip any archives that failed to download
    archive_keywords[url] = [ ]
    text = html.text
    keywords.each do |keyword|
      indices = find_word_indices(text, keyword)
      unless indices.empty?
        contexts = indices.map { |index| get_context_string(text, index) }
        archive_keywords[url] << {keyword => contexts}
      end
    end
  end

  # Invert the hash data structure so that we match keywords to
  # archives, instead of the other way around
  results = { }
  archive_keywords.each do |url, keyword_entries|
    keyword_entries.each do |entry|
      keyword, contexts = entry.first
      results[keyword] ||= [ ]
      results[keyword] << {url => contexts}
    end
  end
  results
end


# Print the given SEARCH_RESULTS in a neat, pretty form where
# each keyword is followed by a list of archive URLs and the
# surrounding contexts in which the keyword appears.
def pretty_print_search_results(search_results)
  search_results.each do |keyword, appearances|
    puts "#{keyword}:"
    appearances.each do |appearance|
      url = appearance.keys[0]
      contexts = appearance.values[0]
      puts "  #{url}:"
      contexts.each do |context|
        # Collapse runs of whitespace/newlines into single spaces
        puts "    #{context.gsub(/\s+/, " ")}"
      end
    end
  end
end
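

# An illustrative extra (an assumption, not in the original script):
# dump the search results hash to a JSON file so an interesting set of
# finds survives between sessions. The default path is arbitrary.
def save_search_results(search_results, path="search_results.json")
  File.write(path, JSON.pretty_generate(search_results))
  puts "Saved results to #{path}"
end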


# Assume the JSON file is provided as the first command line arg
if ARGV.length < 1
  raise(ArgumentError, "No JSON file provided.")
else
  json_file = ARGV[0]
  # And maybe do something with extra, optional command line args?
end

archive_urls = extract_archives_from_json(json_file)
archive_html = { }

# Download HTML for all archives (check: is this sane? Might
# take too long/send too many requests.)
archive_urls = archive_urls[0..9] # DEBUG

archive_urls.each do |url|
  html = download_html_document_object(url)
  # Only keep the archives that actually downloaded
  archive_html[url] = html unless html.nil?

  # Add in a 5s pause so that we don't trip archive.li's
  # "Are you a bot?" CAPTCHA
  sleep(5)
end


# Finally, instantiate an IRB REPL for interactive searching.
# Available variables: json_file, archive_urls, archive_html
# Available functions: download_html_document_object,
# extract_archives_from_json, find_word_indices, get_context_string,
# keywords_search, pretty_print_search_results
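#
# Example session (illustrative; the keyword is an arbitrary pick):
#   results = keywords_search(archive_html, ["train"])
#   pretty_print_search_results(results)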
binding.irb
