#!/usr/bin/perl
#match each second of TV news airtime containing Trump's Twitter handle (via its OCR'd onscreen text) against the archive of his tweets...
#invoked with a single argument: the path of the one-record-per-line OCR JSON file; writes "<file>.match.txt" and "<file>.nomatch.txt" beside it...

use JSON::XS;
use Text::Similarity::Overlaps;
use Date::Parse;

#initialize our similarity engine...
$SIM = Text::Similarity::Overlaps->new( {'normalize' => 0, 'verbose' => 0} );

#############################################
#load all of the Trump tweets... stored in a huge array, not one per line...
#downloaded from http://www.trumptwitterarchive.com/archive
#start with the "core" 2020 collection, which is searched first for every OCR line...
foreach $year (2020) {
    open(FILE, "./$year.json") || die "Unable to open ./$year.json: $!";
    read(FILE, $json, (-s FILE));
    close(FILE);
    undef($JSONREF);
    $JSONREF = decode_json $json;
    if (!defined($JSONREF)) { print "FATAL TWEETS JSON\n"; exit; }
    foreach $rec (@{$JSONREF}) {
        $txt = $rec->{'text'};
        $txt=~s/&amp;/&/g; #undo HTML-escaped ampersands...
        $txt=~s/\t/ /g;
        $txt=~s/\s+/ /gs;
        $match = lc($txt);
        $match=~s/^rt .*?://;
        $match=~s/http.*\s//;
        $match=~s/http.*//;
        $match_min = $match;
        $match_min=~s/[^a-z0-9]//g;
        if (length($match_min) < 25) { next; } #make sure this isn't just a link, etc...
        push(@TWEETS_TXT, $txt);
        push(@TWEETS_TXTMIN, $match_min);
        push(@TWEETS_WORDS, [ split/\s+/, $match ]);
        if (length($rec->{'id_str'}) < 4) { print "FAIL($year)($txt)\n"; }
        push(@TWEETS_DETAILS, $rec->{'id_str'} . "\t" . $rec->{'created_at'} . "\t" . $rec->{'is_retweet'});
        $unixtime = 0;
        eval{ $unixtime = str2time($rec->{'created_at'}); };
        push(@TWEETS_UNIXDATE, $unixtime);
        $NUMTWEETS_CORE++;
    }
}

#add special deleted video tweet that appears often...
$txt = 'TERRIFIED TODLER RUNS FROM RACIST BABY';
$txt=~s/\t/ /g;
$txt=~s/\s+/ /gs;
push(@TWEETS_TXT, $txt);
$match = lc($txt);
$match=~s/^rt .*?://;
$match=~s/http.*\s//;
$match=~s/http.*//;
$match_min = $match;
$match_min=~s/[^a-z0-9]//g;
push(@TWEETS_TXTMIN, $match_min);
push(@TWEETS_WORDS, [ split/\s+/, $match ]);
push(@TWEETS_DETAILS, '1273770669214490626' . "\t" . '2020-06-18 8:12PM EST' . "\t" . 'false');
$unixtime = 0;
eval{ $unixtime = str2time('2020-06-18 8:12PM EST'); };
push(@TWEETS_UNIXDATE, $unixtime);
$NUMTWEETS_CORE++;

print "Loaded Core $NUMTWEETS_CORE...\n";

#now load the "extended" collection covering the previous five years...
foreach $year (2015..2019) {
    open(FILE, "./$year.json") || die "Unable to open ./$year.json: $!";
    read(FILE, $json, (-s FILE));
    close(FILE);
    undef($JSONREF);
    $JSONREF = decode_json $json;
    if (!defined($JSONREF)) { print "FATAL TWEETS JSON\n"; exit; }
    foreach $rec (@{$JSONREF}) {
        $txt = $rec->{'text'};
        $txt=~s/&amp;/&/g; #undo HTML-escaped ampersands...
        $txt=~s/\t/ /g;
        $txt=~s/\s+/ /gs;
        $match = lc($txt);
        $match=~s/^rt .*?://;
        $match=~s/http.*\s//;
        $match=~s/http.*//;
        $match_min = $match;
        $match_min=~s/[^a-z0-9]//g;
        if (length($match_min) < 25) { next; } #make sure this isn't just a link, etc...
        push(@TWEETS_TXT, $txt);
        push(@TWEETS_TXTMIN, $match_min);
        push(@TWEETS_WORDS, [ split/\s+/, $match ]);
        if (length($rec->{'id_str'}) < 4) { print "FAIL($year)($txt)\n"; }
        push(@TWEETS_DETAILS, $rec->{'id_str'} . "\t" . $rec->{'created_at'} . "\t" . $rec->{'is_retweet'});
        $unixtime = 0;
        eval{ $unixtime = str2time($rec->{'created_at'}); };
        push(@TWEETS_UNIXDATE, $unixtime);
        $NUMTWEETS_EXT++;
    }
}
print "Loaded Core+Extended $NUMTWEETS_EXT...\n";
#############################################
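#at this point the tweet archive lives in five parallel arrays, all sharing the same index:
#  @TWEETS_TXT      - original display text of each tweet
#  @TWEETS_TXTMIN   - lowercased text with retweet prefixes, URLs and all non-alphanumerics stripped (used for substring matching)
#  @TWEETS_WORDS    - arrayref of the lowercased words of each tweet (used for word-overlap matching)
#  @TWEETS_DETAILS  - tab-delimited "id_str<TAB>created_at<TAB>is_retweet", copied verbatim into the match output
#  @TWEETS_UNIXDATE - tweet timestamp as a Unix epoch (0 if unparsable) so tweets from the future can be skipped
#indexes 0..($NUMTWEETS_CORE-1) are the core 2020 tweets; the following $NUMTWEETS_EXT entries are the 2015-2019 extended set...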
#############################################
#now load all of the TV airtime seconds containing Trump's Twitter handle... these are one-per-line...
open(OUTNOMATCH, ">$ARGV[0].nomatch.txt"); binmode(OUTNOMATCH, ":utf8");
open(OUTMATCH, ">$ARGV[0].match.txt"); binmode(OUTMATCH, ":utf8");
open(FILE, $ARGV[0]) || die "Unable to open $ARGV[0]: $!";
$ROWID = -1;
$OCRTextCleanLAST = ''; $MAX_IDLAST = -1; #so the very first line has something sane to compare against...
while(<FILE>) {
    $_=~s/\s+$//;

    #extract the fields...
    undef($JSONREF);
    eval{ $JSONREF = decode_json $_; };
    if (!defined($JSONREF)) { next; }
    $date      = $JSONREF->{'date'}      // '';
    $unixtime = 0; eval{ $unixtime = str2time($date); };
    $station   = $JSONREF->{'station'}   // '';
    $showName  = $JSONREF->{'showName'}  // '';
    $iaShowId  = $JSONREF->{'iaShowId'}  // '';
    $iaClipUrl = $JSONREF->{'iaClipUrl'} // '';
    $OCRText   = $JSONREF->{'OCRText'}   // '';
    $OCRText=~s/\t/ /g;
    $OCRText=~s/\s+/ /gs;
    $match = lc($OCRText);
    $match=~s/^rt .*?://;
    $match=~s/http.*\s//;
    $match=~s/http.*//;
    $match_min = $match;
    $match_min=~s/[^a-z0-9]//g;
    $OCRTextMin = $match_min;
    $OCRTextClean = $match;
    $ROWID++;

    #when we move from show to show like MSNBCW_20200101_170000_MSNBC_Live to MSNBCW_20200101_180000_MSNBC_Live, we'll sometimes
    #end up with a transition mid-second and thus two rows with "2020-01-01 18:00:57 UTC MSNBC"... so this nixes those...
    if (exists($SEEN{"$date\t$station"})) { next; }
    $SEEN{"$date\t$station"} = 1;

    #and split into words...
    undef(%ocrwords);
    foreach $word (split/\s+/, $OCRTextClean) { $ocrwords{$word} = 1; }

    if (length($OCRText) < 10) { $OCRTextCleanLAST = ''; $MAX_IDLAST = -1; next; }

    if ($counter++ % 1000 == 0) { print "\tProcessed $counter...\n\n\n"; }

    ############################################
    #compute the difference between this OCR line and the one before it and if it's similar enough, just reuse our match and move to the next line...
    $ocrmatchlast = 0;
    if ($OCRTextCleanLAST eq $OCRTextClean) {
        $ocrmatchlast = 1;
    } else {
        ($score, %scores) = $SIM->getSimilarityStrings($OCRTextCleanLAST, $OCRTextClean);
        if ($scores{'cosine'} > 0.7) { $ocrmatchlast = 1; }
    }

    #copy this line so in case we bail below we have it saved for our next loop...
    $OCRTextCleanLAST = $OCRTextClean;

    #and take action if this was a close match to the last line...
    if ($ocrmatchlast == 1) {
        #this line was similar enough to the last one, so just reuse its results and move on...
        if ($MAX_IDLAST > -1) {
            #the last line had a match, so just copy that match forward...
            print OUTMATCH "$date\t$station\t$showName\t$iaShowId\t$iaClipUrl\t$TWEETS_DETAILS[$MAX_IDLAST]\t$TWEETS_TXT[$MAX_IDLAST]\t$OCRText\n";
            next;
        } else {
            #the last line had no match and we are essentially identical to it, so copy to our nonmatch output...
            print OUTNOMATCH "$date\t$station\t$showName\t$iaShowId\t$iaClipUrl\t$OCRText\n";
            next;
        }
    }
    ############################################

    ############################################
    #otherwise this is a new line so loop over the "core" (2020) tweet collection looking for a match...
    $MAX_SIM = 0; $MAX_ID = -1;
    #print "\tSearching Core Tweets...\n";
    for($i=0;$i<$NUMTWEETS_CORE;$i++) {
        if ($TWEETS_UNIXDATE[$i] > $unixtime) { next; } #skip tweets that are newer than this second of airtime so we don't get false future matches...
        if (index($OCRTextMin, $TWEETS_TXTMIN[$i]) > -1) {
            $MAX_ID = $i; #tweet found in its entirety... done...
            last;
        } else {
            #otherwise do a similarity check...
            $tot_words = 0; $tot_matched = 0;
            foreach $word (@{$TWEETS_WORDS[$i]}) {
                $tot_words++;
                if (exists($ocrwords{$word})) { $tot_matched++; }
            }
            $perc = ($tot_matched / $tot_words) * 100;
            if ($perc > 90 && $perc > $MAX_SIM) { $MAX_SIM = $perc; $MAX_ID = $i; }
        }
    }
    ############################################
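    ############################################
    #note on the word-overlap threshold above: a tweet only counts as a match when MORE than 90% of its
    #normalized words appear somewhere in this second's OCR word list... the check is one-directional, so
    #extra OCR words (chyrons, tickers, captions) never hurt... concretely, a tweet of ten or fewer words
    #must match every single word (9/10 is exactly 90% and fails the strictly-greater test), while an
    #eleven-word tweet can miss one (10/11 = ~91%)... this keeps false positives down at the cost of
    #missing heavily truncated or paraphrased tweets...
    ############################################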
    #if no match, try against the extended complete set of all tweets of the last 5 years...
    if ($MAX_ID < 0) {
        #print "\tSearching Extended Tweets...\n";
        $MAX_SIM = 0; $MAX_ID = -1;
        for($i=$NUMTWEETS_CORE;$i<($NUMTWEETS_CORE + $NUMTWEETS_EXT);$i++) {
            if ($TWEETS_UNIXDATE[$i] > $unixtime) { next; } #skip tweets that are newer than this second of airtime so we don't get false future matches...
            if (index($OCRTextMin, $TWEETS_TXTMIN[$i]) > -1) {
                $MAX_ID = $i; #tweet found in its entirety...
                last;
            } else {
                #otherwise do a similarity check...
                $tot_words = 0; $tot_matched = 0;
                foreach $word (@{$TWEETS_WORDS[$i]}) {
                    $tot_words++;
                    if (exists($ocrwords{$word})) { $tot_matched++; }
                }
                $perc = ($tot_matched / $tot_words) * 100;
                if ($perc > 90 && $perc > $MAX_SIM) { $MAX_SIM = $perc; $MAX_ID = $i; }
            }
        }
    }
    ############################################

    ############################################
    #copy over our match...
    $MAX_IDLAST = $MAX_ID;
    ############################################

    ############################################
    #write match or nomatch and move on to the next OCR line...
    if ($MAX_ID < 0) {
        print OUTNOMATCH "$date\t$station\t$showName\t$iaShowId\t$iaClipUrl\t$OCRText\n";
    } else {
        print OUTMATCH "$date\t$station\t$showName\t$iaShowId\t$iaClipUrl\t$TWEETS_DETAILS[$MAX_ID]\t$TWEETS_TXT[$MAX_ID]\t$OCRText\n";
    }
    ############################################
}
close(FILE);
close(OUTMATCH);
close(OUTNOMATCH);
#############################################
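#reference notes on running this script follow... the script and file names in the SYNOPSIS are examples only,
#reflecting how the code above is invoked rather than any required naming...

__END__

=head1 NAME

tweet_tv_match.pl - match OCR'd on-screen text from TV news against the Trump tweet archive

=head1 SYNOPSIS

    perl tweet_tv_match.pl ./ocrfile.json

The single argument is the path of the OCR file described below; results are written to
"./ocrfile.json.match.txt" and "./ocrfile.json.nomatch.txt".

=head1 INPUT

Two kinds of input are expected:

=over 4

=item * 2015.json through 2020.json in the current directory - yearly tweet dumps from
http://www.trumptwitterarchive.com/archive, each a single JSON array of records with at least
"id_str", "text", "created_at" and "is_retweet" fields.

=item * The file named on the command line - one JSON object per line, one per second of airtime,
with the fields "date", "station", "showName", "iaShowId", "iaClipUrl" and "OCRText" (for example
a date of "2020-01-01 18:00:57 UTC", station "MSNBC" and show id "MSNBCW_20200101_180000_MSNBC_Live",
with the OCR'd on-screen text for that second).

=back

=head1 OUTPUT

Matched seconds are written as tab-delimited rows of date, station, showName, iaShowId, iaClipUrl,
tweet id_str, tweet created_at, tweet is_retweet, tweet text and the OCR text; unmatched seconds get
the same row without the four tweet columns.

=head1 DEPENDENCIES

JSON::XS, Text::Similarity::Overlaps and Date::Parse, all available from CPAN.

=cut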