# encoding: utf-8
# (The magic comment above is needed because this file contains a literal
# en dash in a regex; Ruby 1.9 rejects non-ASCII source without it.)
#
# This program extracts data from "SEN10RS The Good, the Bad, and the Really Really Bad",
# Josh Tubb's 2009 NaNoWriMo project.
# I made some probably-obvious hand edits to the following entries in the text to make the format consistent:
# 35, 102, 191, 192, 238, 298, 324, 333, 334
#
# Input:  tubbs.txt (one entry per line, see ENTRY_PATTERN below)
# Output: tubbs.csv (one row per entry, one boolean column per category)

require 'rubygems'
require 'fastercsv'

# Keywords looked for in each entry's category text. A nested array groups
# alternative keywords (including a known misspelling) that share one CSV
# column.
CATEGORIES = ['tubbs', 'tool', %w{emo goth punk scene}, 'wannabe', 'nerd', 'annoying',
              'promiscuous five', 'band', 'timberlane player', 'fake', %w{gay lesbian},
              'sporty', 'jock', %w{queit quiet}, 'special education', 'advanced placement',
              'gray', 'pac', 'artistic', 'different', 'teacher', 'druggie', 'involved',
              'living life', 'gangstah', 'stupid', 'nice', 'bwitch'].freeze

# A line is treated as an entry when it starts with "<digits>." followed by
# whitespace, two or three words, and a colon. The dot is escaped so that
# lines lacking a literal "." after the number are skipped instead of being
# mis-split later by split('.').
ENTRY_PATTERN = /^\d+\.\s+\w+ \w+\s*\w*:/

# get a list of booleans for the specified categories
#
# cat_texts - Array of lower-cased category strings taken from one entry.
#
# Returns an Array with one true/false per element of CATEGORIES, in order.
# A category is true when any of its keywords occurs as a substring of any
# of the given texts.
# Raises RuntimeError when no category matched at all.
def categories(cat_texts)
  fields = CATEGORIES.map do |category|
    # Array() lets single keywords and keyword groups share one code path.
    Array(category).any? do |keyword|
      cat_texts.any? { |text| text.include?(keyword) }
    end
  end
  raise "Doesn't have any categories: #{cat_texts}" unless fields.any?
  fields
end

#names of categories (for CSV header)
#
# Returns an Array of Strings; grouped keywords are joined with "/".
def cat_headers
  CATEGORIES.map { |category| Array(category).join('/') }
end

# Maps the textual infinities onto numeric sentinels; any other rank is
# returned unchanged. The negative form (written with an en dash) must be
# tested first because it also contains "infinity".
def normalize_rank(rank)
  case rank
  when /–infinity/ then "-100"
  when /infinity/ then "100"
  else rank
  end
end

# Splits one entry line into its raw parts.
#
# Two layouts are handled:
#   "<n>. <name>: <cats>: <rank>. <cool>, ..."  (rank after a second colon)
#   "<n>. <name>: <cats>. <rank>. <cool>, ..."  (rank in its own sentence)
#
# Returns [number, first_name, last_name, cool, rank, category_texts].
def parse_entry(line)
  sentences = line.split('.')
  number = sentences[0].strip

  name_text, cats_text, inline_rank = sentences[1].strip.split(/:\s*/)
  fname, *other_names = name_text.split(' ')
  lname = other_names.join(' ')

  # When the rank sits inside the second sentence, "cool" is one sentence
  # earlier than in the other layout.
  cool_sentence = sentences[inline_rank ? 2 : 3]
  cool = cool_sentence.split(/,\s+/)[0].strip.downcase

  cats = cats_text.split(/\/\s*/).map { |s| s.strip.downcase }
  rank = normalize_rank((inline_rank || sentences[2]).strip.downcase)

  [number, fname, lname, cool, rank, cats]
end

FasterCSV.open("tubbs.csv", "w") do |csv|
  File.open('tubbs.txt', "r") do |infile|
    csv << ['number', 'first name', 'last name', 'cool?', 'rank', *cat_headers]

    infile.each_line do |line|
      next unless line =~ ENTRY_PATTERN

      number, fname, lname, cool, rank, cats = parse_entry(line)
      begin
        csv << [number, fname, lname, cool, rank, *categories(cats)]
      rescue StandardError => e
        # StandardError rather than Exception, so Ctrl-C / exit / out-of-memory
        # are not swallowed and re-labelled as a data problem.
        raise "Problem with #{fname} #{lname}: #{e.message}"
      end
    end
  end
end