#!/usr/bin/perl
#
# Final step of the pipeline: takes the ".hist" files that "makehist.pl" left
# in ./CACHE/ and brings them together into a single timeline.
#
# Each cache file is named STATION.YYYYMM.<something> and holds one
# "word<TAB>count" pair per line.  For every (date, station) pair the script
# prints the five words with the highest TF-IDF style score:
#
#     count * log( total_files / (files_where_word_count_exceeds_1 + 1) )
#
# Output is one tab-separated row per date/station: "M/YYYY<TAB>STATION<TAB>
# word, word, ...".  The date label is printed only on the first row of each
# date so the timeline reads as grouped blocks.

use strict;
use warnings;

# Run the pipeline only when executed as a script, so the helpers below can
# be loaded and exercised on their own.
main() unless caller;

# Entry point: load the cache, then print the timeline ordered by date and,
# within a date, by station name.
sub main {
    print "Compiling probability table...\n";

    my $cache    = load_cache(glob("./CACHE/*.hist"));
    my @stations = sort keys %{ $cache->{stations} };

    foreach my $date (sort { $a <=> $b } keys %{ $cache->{dates} }) {
        my $wrote_date = 0;
        foreach my $station (@stations) {
            # A station may have no file for this date; it still gets a row
            # (with an empty word list) so every date block has the same shape.
            my $word_counts = $cache->{counts}{$date}{$station} || {};
            my $scores = score_words($word_counts, $cache->{total_docs},
                                     $cache->{doc_freq});
            my $list   = join(', ', top_words($scores, 5));

            my $human_date = $wrote_date++ == 0 ? format_month($date) : '';
            print "$human_date\t$station\t$list\n";
        }
    }
    return;
}

# Split a cache path into (station, date).  Returns the empty list when the
# name does not look like CACHE/STATION.YYYYMM.<something>.
sub parse_hist_name {
    my ($file) = @_;
    my ($station, $date) = $file =~ /CACHE\/([A-Z]+)\.(\d\d\d\d\d\d)\./;
    return defined $station ? ($station, $date) : ();
}

# Read every given cache file and build the lookup tables.
#
# Returns a hash ref with:
#   counts     - {date}{station}{word} => summed count
#   doc_freq   - {word} => number of lines (across all files) whose count > 1
#   stations   - {station} => 1 for every station seen
#   dates      - {date} => 1 for every date seen
#   total_docs - number of files successfully loaded
#
# Files with an unrecognised name, or that cannot be opened, are reported on
# STDERR and skipped rather than polluting the tables with empty keys.
sub load_cache {
    my @files = @_;
    my %cache = (
        counts     => {},
        doc_freq   => {},
        stations   => {},
        dates      => {},
        total_docs => 0,
    );

    foreach my $file (@files) {
        my ($station, $date) = parse_hist_name($file);
        unless (defined $station) {
            warn "Skipping $file: name is not STATION.YYYYMM.*\n";
            next;
        }

        my $fh;
        unless (open($fh, '<', $file)) {
            warn "Skipping $file: $!\n";
            next;
        }

        $cache{stations}{$station} = 1;
        $cache{dates}{$date}       = 1;

        while (my $line = <$fh>) {
            chomp $line;
            my ($word, $count) = split /\t/, $line;

            # Blank lines and lines without a tab carry no usable pair.
            next unless defined $word && length $word && defined $count;

            {
                # Keep the original lenient numification ("3\r" => 3,
                # "abc" => 0) without emitting a warning per bad line.
                no warnings 'numeric';
                $count += 0;
            }

            $cache{counts}{$date}{$station}{$word} += $count;
            $cache{doc_freq}{$word}++ if $count > 1;
        }
        close($fh);

        $cache{total_docs}++;
    }

    return \%cache;
}

# Score each word of one date/station bucket.
#   $word_counts - hash ref word => count for the bucket
#   $total_docs  - number of cache files loaded
#   $doc_freq    - hash ref word => background frequency
# Returns a hash ref word => score.  The "+ 1" keeps the divisor non-zero for
# words that never exceeded a count of 1 anywhere.
sub score_words {
    my ($word_counts, $total_docs, $doc_freq) = @_;
    my %score;
    foreach my $word (keys %$word_counts) {
        my $seen_in = ($doc_freq->{$word} || 0) + 1;
        $score{$word} = $word_counts->{$word} * log($total_docs / $seen_in);
    }
    return \%score;
}

# Return up to $limit words from $scores, highest score first.  Equal scores
# are ordered alphabetically so the output is identical from run to run.
sub top_words {
    my ($scores, $limit) = @_;
    my @ranked = sort { $scores->{$b} <=> $scores->{$a} or $a cmp $b }
                 keys %$scores;
    splice(@ranked, $limit) if @ranked > $limit;
    return @ranked;
}

# Turn "YYYYMM" into "M/YYYY" (month without a leading zero).
sub format_month {
    my ($date) = @_;
    return (substr($date, 4, 2) + 0) . '/' . substr($date, 0, 4);
}