#!/usr/bin/perl
#KEYWORD SCANNER DEMO FOR NGRAMMERV3: ./demo-keywordalerts.pl [OPTIONAL YYYYMMDDHHMMSS DATE]
#PREREQS: curl, pigz

use JSON::XS;
use Unicode::Normalize;

################################################
#load our dictionary of terms to look for... FORMAT: keyphrase<TAB>casesensitive(1=case-sensitive,0=case-insensitive)<TAB>comma-separated list of themes
open(FILE, "./KEYWORDS.TXT");
binmode(FILE, ":utf8");
while(<FILE>) {
	$_=~s/#.*//; #strip comments...
	$_=~s/\s+$//; #strip trailing whitespace...
	if (length($_) < 3) { next; }; #skip blank lines...
	($keyphrase, $cased, $themes) = split/\t/, $_; #split the tab-delimited fields...
	$keyphrase = NFC($keyphrase); #normalize to NFC to match our ngrams...
	if ($cased == 1) { #case-sensitive...
		push(@KEYWORDS_CASED, $keyphrase); push(@THEMES_CASED, $themes); $NUM_CASED++;
	} else { #case-insensitive...
		push(@KEYWORDS_LCASED, lc($keyphrase)); push(@THEMES_LCASED, $themes); $NUM_LCASED++;
	}
}
close(FILE);
################################################

################################################################################################
#FIRST PERFORM THE KEYWORD SEARCHES...

#get the date/time of the file we should process...
if ($ARGV[0] =~ /^\d{14}$/) { #we've been handed a specific date/time to work with...
	$FILEDATETIME = $ARGV[0];
} else { #compute the date/time two minutes ago and use that as the file we request...
	($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time - 120); #run the file from two minutes ago...
	$FILEDATETIME = sprintf("%04d%02d%02d%02d%02d00", $year+1900, $mon+1, $mday, $hour, $min); #round to the start of the minute...
}

#stream in the latest ngrams update and process it...
open(FILE, "curl -s http://data.gdeltproject.org/gdeltv3/webngrams/$FILEDATETIME.webngrams.json.gz | pigz -d |");
while(<FILE>) { #loop over each entry...
	my $ref = decode_json($_);

	#see how to concat pre+ngram+post...
	my $search;
	if ($ref->{'type'} == 2) { $search = $ref->{'pre'} . $ref->{'ngram'} . $ref->{'post'}; } #scriptio continua language...
	else { $search = $ref->{'pre'} . ' ' . $ref->{'ngram'} . ' ' . $ref->{'post'}; } #space-segmented language...

	#loop over each cased keyword and look for a match... this linear scan is pathologically inefficient, but keeps the code simple for this demonstration...
	for(my $i=0;$i<$NUM_CASED;$i++) {
		if (index($search, $KEYWORDS_CASED[$i]) > -1) {
			foreach (split/,/, $THEMES_CASED[$i]) { $MATCHES{$ref->{'url'}}{$_} = 1; };
			$NUMMATCHES++;
		};
	}

	#now check the case-insensitive keywords against a lowercased copy of the snippet...
	if ($NUM_LCASED > 0) {
		$search = lc($search);
		for(my $i=0;$i<$NUM_LCASED;$i++) {
			if (index($search, $KEYWORDS_LCASED[$i]) > -1) {
				foreach (split/,/, $THEMES_LCASED[$i]) { $MATCHES{$ref->{'url'}}{$_} = 1; };
				$NUMMATCHES++;
			};
		}
	}
}
close(FILE);

#if no matches (or no ngram file for this minute), exit...
if ($NUMMATCHES < 1) { exit; };
################################################################################################

################################################################################################
#NOW MERGE WITH THE GDELT ARTICLE LIST TO GET THE METADATA FOR EACH RECORD...

#ensure our output directory exists...
if (!-e "./RESULTS/") { mkdir("./RESULTS/"); };

open(FILE, "curl -s http://data.gdeltproject.org/gdeltv3/gal/$FILEDATETIME.gal.json.gz | pigz -d |");
open(OUT, ">./RESULTS/$FILEDATETIME.results.json");
while(<FILE>) { #loop over each entry...
	my $ref = decode_json($_);
	if (exists($MATCHES{$ref->{'url'}})) { #this was one of our matches, so output...
		chomp($_); chop($_); #strip the trailing newline, then the closing brace, so we can append our themes field...
		my $themes;
		foreach (sort keys %{$MATCHES{$ref->{'url'}}}) { $themes .= $_ . ','; };
		chop($themes); #remove the trailing comma...
		$themes = JSON::XS->new->allow_nonref(1)->encode($themes . ''); #JSON-escape the theme list (the . '' forces string encoding)...
		print OUT "$_, \"themes\": $themes }\n";
	}
}
close(OUT);
close(FILE);
################################################################################################
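
################################################################################################
#APPENDIX: a hypothetical KEYWORDS.TXT illustrating the expected format. These entries and
#theme labels are examples only, not part of the original demo; the three fields must be
#TAB-separated:
#
#	climate change	0	ENV_CLIMATE,POLICY
#	COVID	1	HEALTH_PANDEMIC
#	#comment lines like this one are stripped and skipped...
#
#the first field is the keyphrase to scan for, the second is 1 for a case-sensitive match or
#0 for case-insensitive, and the third is the comma-separated list of themes to record for
#any article whose snippets contain the keyphrase.
#
#since each ngrams file covers one minute, one way to run this continuously would be a
#crontab entry along these lines (path is illustrative):
#
#	* * * * * cd /path/to/scanner && ./demo-keywordalerts.pl
################################################################################################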