#!/usr/bin/perl
#
# Builds a histogram of the short phrases that follow "trump is/was/are/were"
# in the snippet column of a CSV file.
#
# Usage:  script.pl INPUT.csv
#
# Input:  a comma-separated file whose first line is a header and whose
#         seventh field is a double-quoted text snippet.  The snippet is the
#         last field, so it may itself contain commas.  (The first six fields
#         are not used here; the original code named them url, datetime,
#         station, show, showid and thumb.)
# Output: INPUT.csv.hist, one "phrase<TAB>count" line per distinct phrase,
#         most frequent first.

use strict;
use warnings;

# Extract the (up to) two-word phrase following "trump is|was|are|were" from a
# snippet.
#
# Parameters: $snippet - the snippet text, any letter case; may be undef.
# Returns:    the cleaned, lower-cased phrase, or '' when the snippet holds no
#             usable phrase (no match, or nothing left after clean-up).
sub extract_phrase {
    my ($snippet) = @_;
    return '' unless defined $snippet;

    $snippet = lc $snippet;

    # Prefer the form with an article ("trump is a <w1> <w2>") so the article
    # itself is not counted as part of the phrase; otherwise fall back to the
    # two tokens directly after the verb.
    my ($phrase) = $snippet
        =~ /trump (?:is|was|are|were) (?:a|an|the) ([^\s]+ [^\s]+)/;
    if ( !defined $phrase ) {
        ($phrase) = $snippet =~ /trump (?:is|was|are|were) ([^\s]+ [^\s]+)/;
    }
    return '' unless defined $phrase;

    # Clean up: strip surrounding punctuation, cut the phrase at the first
    # sentence/clause punctuation mark, then drop a dangling trailing stop
    # word and the whitespace it leaves behind.
    $phrase =~ s/^[^a-z0-9]+//;
    $phrase =~ s/[^a-z0-9]+$//;
    $phrase =~ s/[,\.\?].*//;
    $phrase =~ s/\b(?:and|or|was|were|but|in|that|the)$//;
    $phrase =~ s/\s+$//;

    return $phrase;
}

# Read the CSV at $path, count the extracted phrases, and write the histogram
# to "$path.hist".
#
# Parameters: $path - path of the input CSV file.
# Returns:    0 on success.
# Dies:       when no path is given or when a file cannot be opened/closed.
sub main {
    my ($path) = @_;
    die "usage: $0 INPUT.csv\n" unless defined $path && length $path;

    open my $in, '<', $path
        or die "cannot open '$path' for reading: $!\n";

    my %count_of;
    my $line_no = 0;
    while ( my $line = <$in> ) {
        next if $line_no++ == 0;    # skip the header row

        # Remove the line terminator first so that the closing quote really
        # is the last character of the snippet.
        chomp $line;
        $line =~ s/\r\z//;

        # Limit of 7 keeps commas inside the snippet from splitting it.
        my @fields = split /,/, $line, 7;
        next if @fields < 7;        # malformed/short row: no snippet

        my $snippet = $fields[6];
        $snippet =~ s/\A"//;        # drop the surrounding quote marks,
        $snippet =~ s/"\z//;        # but only when they are present

        my $phrase = extract_phrase($snippet);
        next if $phrase eq '';      # e.g. "trump is" ended the snippet

        $count_of{$phrase}++;
    }
    close $in or die "cannot close '$path': $!\n";

    my $out_path = "$path.hist";
    open my $out, '>', $out_path
        or die "cannot open '$out_path' for writing: $!\n";

    # Highest count first; ties are broken alphabetically so the output is
    # reproducible from run to run.
    my @ordered = sort { $count_of{$b} <=> $count_of{$a} or $a cmp $b }
        keys %count_of;
    for my $phrase (@ordered) {
        print {$out} "$phrase\t$count_of{$phrase}\n";
    }

    # Buffered write errors only surface at close, so check it.
    close $out or die "cannot close '$out_path': $!\n";

    return 0;
}

# Run only when executed as a script, so the subs above can be loaded (e.g.
# by a test) without touching any files.
main(@ARGV) unless caller;

1;