A Ruby script to efficiently search through a JSON file of archives of a certain lolcow's exploits over the last two decades.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

176 lines
5.4 KiB

  1. #!/usr/bin/ruby
  2. # A script to efficiently search through a JSON file of archives
  3. # of a certain lolcow's exploits over the last two decades.
  4. #
  5. # Assuming the JSON file is saved as archives.json, run this script
  6. # at the command line using
  7. # ./scan_TrainDodger_archives.rb archives.json
  8. #
  9. # The script downloads local HTML strings for all of the archives
  10. # listed in the JSON file, and defines various functions for
  11. # efficiently searching through the archives for interesting tid-bits.
  12. # The script finally instantiates an IRB repl where the user has
  13. # fine-grained control over further searching and filtering.
  14. #
  15. # Most useful is probably the KEYWORDS_SEARCH(archive, keywords)
  16. # function, which peruses the archive HTML for a given set of keywords
  17. # and returns a hash listing the archive urls and the context for
  18. # easy reading.
  19. require "irb"
  20. require "json"
  21. require "nokogiri"
  22. require "open-uri"
  23. # Download and return the HTML for the webpage at the given URL.
  24. #
  25. # Usage:
  26. # html = get_html_document_object("https://archive.li/XXXXX")
  27. # #=> Nokogiri::HTML::Document ]>
  28. def download_html_document_object(url)
  29. print "Downloading from " + url + "... "
  30. STDOUT.flush
  31. begin
  32. html = Nokogiri::HTML.parse(URI.open(url))
  33. rescue OpenURI::HTTPError
  34. # Didn't download properly (e.g. timeout, bad read, etc)
  35. puts "FAIL"
  36. return nil
  37. end
  38. puts "OK"
  39. return html
  40. end
  41. # Return an array of archive URL strings extracted from the
  42. # given JSON_FILE.
  43. #
  44. # Usage:
  45. # archives = extract_archives_from_json(json_file)
  46. # archives #=> ["https://archive.li/XXXXX", ... ]
  47. def extract_archives_from_json(json_file)
  48. json_string = IO.read(json_file)
  49. JSON::parse(json_string).map { |archive| archive["archived"] }
  50. end
  51. # Return an array of all indices at which a given WORD appears in
  52. # the given TEXT string. The search is case-insensitive.
  53. #
  54. # Usage:
  55. # find_word_indices("she sells seashells", "She")
  56. # #=> [0, 13]
  57. def find_word_indices(text, word)
  58. regex = Regexp.new("(?=" + word + ")", Regexp::IGNORECASE)
  59. text.enum_for(:scan, regex).map { Regexp.last_match.offset(0).first }
  60. end
  61. # Return a 'context' string about the given INDEX in the supplied TEXT.
  62. # i.e. return the substring delimited by
  63. # [INDEX - CONTEXT_LIM, INDEX + CONTEXT_LIM].
  64. # By default, CONTEXT_LIM = 50.
  65. #
  66. # Usage:
  67. # get_context_string("This is a long string of text", 5, 7)
  68. # #=> "This is a lon"
  69. def get_context_string(text, index, context_lim=50)
  70. context_start = [0, index - context_lim].max
  71. context_end = [text.length - 1, index + context_lim].min
  72. text[context_start..context_end]
  73. end
  74. # Search through the given ARCHIVES for each of the given
  75. # KEYWORDS, and return a hash mapping each keyword to a list
  76. # of the archives the keyword was found in (if any) and their
  77. # surrounding contexts.
  78. #
  79. # Usage:
  80. # keywords_search(archives, ["keyword"])
  81. # #=> {"keyword"=>
  82. # # {"http://archive.li/XXXXX"=>
  83. # # ["The keyword appeared in this sentence"]}}
  84. def keywords_search(archives, keywords)
  85. # For efficiency, we loop through the archives and collect all
  86. # of the keywords+context for each archive URLs at once. But this
  87. # isn't a nice way to structure the results, so we invert the
  88. # data structure afterward.
  89. archive_keywords = { }
  90. archives.each do |url, html|
  91. archive_keywords[url] = [ ]
  92. text = html.text
  93. keywords.each do |keyword|
  94. indices = find_word_indices(text, keyword)
  95. unless indices.empty?
  96. context = indices.map { |index| get_context_string(text, index) }
  97. archive_keywords[url] << {keyword => context}
  98. end
  99. end
  100. end
  101. # Invert the hash data structure so that we match keywords to
  102. # archives, instead of the other way around
  103. keywords = { }
  104. archive_keywords.each do |url, keywords_list|
  105. keywords_list.each do |keyword|
  106. keywords[keyword.keys[0]] ||= [ ]
  107. keywords[keyword.keys[0]] << {url=>keyword.values[0]}
  108. end
  109. end
  110. keywords
  111. end
  112. # Print the given SEARCH_RESULTS in a neat, pretty form where
  113. # each keyword is followed by a list of archive URLs and the
  114. # surrounding contexts in which the keyword appears.
  115. def pretty_print_search_results(search_results)
  116. search_results.each do |keyword, appearances|
  117. puts "#{keyword}:"
  118. appearances.each do |appearance|
  119. url = appearance.keys[0]
  120. contexts = appearance.values[0]
  121. puts " #{url}:"
  122. contexts.each do |context|
  123. # Remove any whitespace or newlines
  124. puts " #{context.gsub(/\s+/, " ")}"
  125. end
  126. end
  127. end
  128. end
  129. # Assume the JSON file provided as first command line arg
  130. if ARGV.length < 1
  131. raise(ArgumentError, "No JSON file provided.")
  132. else
  133. json_file = ARGV[0]
  134. # And maybe do something with extra, optional command line args?
  135. end
  136. archive_urls = extract_archives_from_json(json_file)
  137. archive_html = { }
  138. # Download HTML for all archives (check: is this sane? Might
  139. # take too long/send too many requests.)
  140. archive_urls = archive_urls[0..9] # DEBUG
  141. archive_urls.each do |url|
  142. archive_html[url] = download_html_document_object(url)
  143. # Add in a 5s pause so that we don't trip archive.li's
  144. # "Are you a bot?" CAPTCHA
  145. sleep(5)
  146. end
  147. # Finally, instantiate an IRB REPL for interactive searching.
  148. # Available variables: json_file, archive_urls, archive_html
  149. # Available functions: download_html_document_object,
  150. # extract_archives_from_json, find_word_indices, get_context_string,
  151. # keywords_search, pretty_print_search_results
  152. binding.irb