#!/usr/bin/perl
#ENTITY EXTRACTION DEMO FOR NGRAMMERV3: ./demo-entityextract.pl [OPTIONAL YYYYMMDDHHMMSS DATE]
#PREREQS: curl, pigz, Lingua::EN::Tagger, JSON::XS

use JSON::XS;
use Lingua::EN::Tagger;

#create the tagger object...
$TAGGER = new Lingua::EN::Tagger;

################################################################################################
#FIRST SCAN THE NGRAMS STREAM AND EXTRACT CANDIDATE ENTITIES...

#get the date/time of the file we should process...
if ($ARGV[0] =~ /^\d{14}$/) {
    #we've been handed a specific date/time to work with...
    $FILEDATETIME = $ARGV[0];
} else {
    #compute the date two minutes ago and use that as the file we request...
    ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time - 120); #run the file from two minutes ago...
    $FILEDATETIME = sprintf("%04d%02d%02d%02d%02d00", $year+1900, $mon+1, $mday, $hour, $min); #round to the start of the minute...
}

#stream in the latest ngrams update and process it...
open(FILE, "curl -s http://data.gdeltproject.org/gdeltv3/webngrams/$FILEDATETIME.webngrams.json.gz | pigz -d |");
while(<FILE>) {
    #loop over each entry...
    my $ref = decode_json($_);

    #the tagger we use here is for English, so skip non-English articles...
    if ($ref->{'lang'} ne 'en' && $ref->{'lang'} ne 'ENGLISH') { next; };

    #decide how to concatenate pre+ngram+post...
    my $search;
    if ($ref->{'type'} == 2) { $search = $ref->{'pre'} . $ref->{'ngram'} . $ref->{'post'}; } #scriptio continua language...
    else { $search = $ref->{'pre'} . ' ' . $ref->{'ngram'} . ' ' . $ref->{'post'}; } #space-segmented language...
    my $searchlc = lc($search);
    my $searchlen = length($search);

    #get all noun phrases and permutations thereof...
    my $tagged = $TAGGER->add_tags($search);
    my %nps = $TAGGER->get_noun_phrases($tagged);

    #cycle over all of the extracted entities... make sure each one doesn't start or end the snippet, as that could indicate a truncated entity...
    #ie, "White House" could appear as "House announced today..." where the first word is clipped, so we only match if the phrase is not at the start or end...
    foreach (sort keys %nps) {
        if (index($searchlc, lc($_), 1) > -1 && index($searchlc, lc($_)) < ($searchlen - length($_))) {
            $MATCHES_CNT{$ref->{'url'}}{$_} += (100 - $ref->{'pos'})/100; #record the number of appearances, adjusted by the inverse of the position decile (appearances higher up count more)...
            $NUMMATCHES++;
        };
    };

    if ($counter++ % 1000 == 0) { print "\tProcessed $counter records...\n"; }
}
close(FILE);

#if there were no matches (or no ngram file for this minute), exit...
if ($NUMMATCHES < 1) { exit; };

################################################################################################


################################################################################################
#NOW MERGE WITH THE GDELT ARTICLE LIST TO GET THE METADATA FOR EACH RECORD...

#ensure our output directory exists...
if (!-e "./RESULTS/") { mkdir("./RESULTS/"); };

open(FILE, "curl -s http://data.gdeltproject.org/gdeltv3/gal/$FILEDATETIME.gal.json.gz | pigz -d |");
open(OUT, ">./RESULTS/$FILEDATETIME.results.json");
while(<FILE>) {
    #loop over each entry...
    my $ref = decode_json($_);
    if (exists($MATCHES_CNT{$ref->{'url'}})) {
        #this was one of our matches, so output...
        chomp($_); chop($_); #remove the trailing newline and the closing brace...

        #score the entities... (%entities holds the scores; the scalar $entities below holds the assembled JSON string)
        my %entities;
        foreach (keys %{$MATCHES_CNT{$ref->{'url'}}}) {
            $entities{$_} = $MATCHES_CNT{$ref->{'url'}}{$_}; #set to the number of appearances adjusted by position...
            #$entities{$_} = $MATCHES_CNT{$ref->{'url'}}{$_} * scalar(split(/\s+/, $_)); #this version corrects for longer phrases appearing less often by multiplying by the number of words...
        }

        #assemble the entities into a JSON array, highest-scoring first...
        my $entities;
        foreach (sort { $entities{$b} <=> $entities{$a} } keys %entities) {
            $entities .= '{"entity":' . JSON::XS->new->allow_nonref(1)->utf8->encode($_ . '') . ",\"score\": $entities{$_}},"; #also encode to UTF8 as we go, since GAL is UTF8 already and we aren't decoding it...
        };
        chop($entities); #remove the trailing comma...

        #append the entities array to the original GAL record and write it out...
        print OUT "$_, \"entities\": [ $entities ] }\n";
    }
}
close(OUT); close(FILE);
################################################################################################
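
#EXAMPLE USAGE (illustrative sketch; the timestamp below is hypothetical)...
#process the web ngrams file for a specific minute and inspect the merged results:
#    ./demo-entityextract.pl 20240101120000
#    head -1 ./RESULTS/20240101120000.results.json
#each output line is the original GAL record for a matched URL with an appended "entities"
#array of {"entity": ..., "score": ...} objects, sorted by descending score...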