#!/usr/bin/perl
#DOCUMENT SENTIMENT DEMO FOR NGRAMMERV3: ./demo-docsentiment.pl [OPTIONAL YYYYMMDDHHMMSS DATE]
#PREREQS: curl, pigz, JSON::XS
use JSON::XS;

################################################
#load our dictionaries of terms to look for...
open(FILE, "./SENTIMENT-POS.TXT"); binmode(FILE, ":utf8");
while(<FILE>) {
    $_=~s/#.*//; #strip comments...
    $_=~s/\s+$//; #strip trailing whitespace/newline...
    if (length($_) < 3) { next; }; #skip blank or too-short lines...
    $EMOT_POS{$_} = 1;
}
close(FILE);

open(FILE, "./SENTIMENT-NEG.TXT"); binmode(FILE, ":utf8");
while(<FILE>) {
    $_=~s/#.*//; #strip comments...
    $_=~s/\s+$//; #strip trailing whitespace/newline...
    if (length($_) < 3) { next; }; #skip blank or too-short lines...
    $EMOT_NEG{$_} = 1;
}
close(FILE);
################################################

################################################################################################
#FIRST PERFORM THE KEYWORD SEARCHES...

#get the date/time of the file we should process...
if ($ARGV[0] =~ /\d\d\d\d\d\d\d\d\d\d\d\d\d\d/) {
    #we've been handed a specific date/time to work with...
    $FILEDATETIME = $ARGV[0];
} else {
    #compute the date two minutes ago and use that as the file we request...
    ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time - 120); #run the file from two minutes ago...
    $FILEDATETIME = sprintf("%04d%02d%02d%02d%02d00", $year+1900, $mon+1, $mday, $hour, $min); #round to the start of the minute...
}

#stream in the latest ngrams update and process it...
open(FILE, "curl -s http://data.gdeltproject.org/gdeltv3/webngrams/$FILEDATETIME.webngrams.json.gz | pigz -d |");
while(<FILE>) { #loop over each entry...
    my $ref = decode_json($_);

    #the sentiment dictionary we use here is for English, so skip non-English articles...
    if ($ref->{'lang'} ne 'en' && $ref->{'lang'} ne 'ENGLISH') { next; };

    #handle scoring for single words...
    my $ngram = lc($ref->{'ngram'});
    $ngram=~s/^\p{Common}*//; $ngram=~s/\p{Common}*$//; #trim leading/trailing punctuation, digits and symbols...
    $SCORE_POS{$ref->{'url'}} += $EMOT_POS{$ngram};
    $SCORE_NEG{$ref->{'url'}} += $EMOT_NEG{$ngram};
    if ($EMOT_POS{$ngram} > 0 || $EMOT_NEG{$ngram} > 0) { $SCORE_CNT{$ref->{'url'}}++; $NUMMATCHES++; }

    #if this was a word, count the total words...
    if ($ngram=~/\p{Letter}/) { $SCORE_TOTWORDS{$ref->{'url'}}++; }

    #handle scoring for two-word phrases... any internal punctuation will prevent a match, so "good, riddance" won't match "good riddance"...
    $ngram = lc($ref->{'ngram'} . ' ' . (split /\s+/, lc($ref->{'post'}))[0]);
    $ngram=~s/^\p{Common}*//; $ngram=~s/\p{Common}*$//;
    $SCORE_POS{$ref->{'url'}} += $EMOT_POS{$ngram};
    $SCORE_NEG{$ref->{'url'}} += $EMOT_NEG{$ngram};
    if ($EMOT_POS{$ngram} > 0 || $EMOT_NEG{$ngram} > 0) { $SCORE_CNT{$ref->{'url'}}++; $NUMMATCHES++; }

    if ($counter++ % 10000 == 0) { print "\tProcessed $counter records...\n"; }
}
close(FILE);

#if no matches (or no ngram file for this minute), exit...
if ($NUMMATCHES < 1) { exit; };
################################################################################################

################################################################################################
#NOW MERGE WITH THE GDELT ARTICLE LIST (GAL) TO GET THE METADATA FOR EACH RECORD...

#ensure our output directory exists...
if (!-e "./RESULTS/") { mkdir("./RESULTS/"); };

open(FILE, "curl -s http://data.gdeltproject.org/gdeltv3/gal/$FILEDATETIME.gal.json.gz | pigz -d |");
open(OUT, ">./RESULTS/$FILEDATETIME.results.json");
while(<FILE>) { #loop over each entry...
    my $ref = decode_json($_);
    if (exists($SCORE_CNT{$ref->{'url'}}) && $SCORE_TOTWORDS{$ref->{'url'}} > 0) {
        #this was one of our matches, so output...
        chomp($_); chop($_); #remove the closing brace...
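        #NOTE: each GAL record arrives as a single-line JSON object, so after chomp() strips
        #the trailing newline, chop() removes the closing "}" -- this lets us append our two
        #computed fields to the original record below and re-close the object ourselves...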
        #compute our scores...
        my $valence = sprintf("%0.1f", $SCORE_CNT{$ref->{'url'}} / $SCORE_TOTWORDS{$ref->{'url'}} * 100) + 0.0; #density of dictionary matches as a percent of total words; sprintf rounds to one decimal and the +0.0 coerces it back to a number...
        my $emot = ($SCORE_POS{$ref->{'url'}} - $SCORE_NEG{$ref->{'url'}}) + 0; #net emotion: positive matches minus negative matches...
        print OUT "$_, \"valence\": $valence, \"emotion\": $emot }\n";
    }
}
close(OUT); close(FILE);
################################################################################################
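__END__

The block below is an optional, illustrative sketch and is not part of the demo above
(everything after __END__ is ignored by Perl). It shows one way to read back a finished
./RESULTS/YYYYMMDDHHMMSS.results.json file, which the demo writes as newline-delimited
JSON. It assumes only the "url" field carried over from the GAL record plus the
"valence" and "emotion" fields appended above; the choice to rank by most-negative net
emotion is purely an example.

    #!/usr/bin/perl
    #read one results file and print the ten most negative articles by net emotion...
    use JSON::XS;

    my $FILEDATETIME = $ARGV[0] or die "usage: $0 YYYYMMDDHHMMSS\n";
    open(FILE, "<", "./RESULTS/$FILEDATETIME.results.json") or die "no results file: $!\n";

    my @recs;
    while(<FILE>) {
        my $ref = eval { decode_json($_) } or next; #skip any malformed lines...
        push @recs, $ref;
    }
    close(FILE);

    #sort ascending by net emotion so the most negative articles come first...
    my @sorted = sort { $a->{'emotion'} <=> $b->{'emotion'} } @recs;
    foreach my $rec (@sorted[0 .. 9]) {
        last if !defined $rec; #fewer than ten records in this minute...
        print "$rec->{'emotion'}\t$rec->{'valence'}\t$rec->{'url'}\n";
    }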