#!/usr/bin/perl #(2/15/2015): Sample script to create a dashboard map of global conflict from GDELT using CartoDB - updated each day. #NOTE: this script is designed to work with GDELT 1.0 daily update files and must be modified to work with GDELT 2.0 files. It requires that you have some kind of automated cronjob or equivalent that automatically downloads the latest GDELT 1.0 event file update each morning, unzips it, and then runs this script. #set the following variables... $EVENTDIRECTORY = ''; #the full path on your system to the directory you are downloading the event files to... $CARTODBACCOUNT = ''; #this is the "username" of your CartoDB account... $CARTODBAPIKEY = ''; #the "api_key" key that CartoDB generates for your account (see CartoDB's documentation for where to find this)... #initialize LWP to communicate with CartoDB's API... use LWP::UserAgent; $useragent = LWP::UserAgent->new; $useragent->agent("GDELT Dashboard Mapper"); #set our rundate to be the day before... ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time-(86400)); $RUNDATE = sprintf("%d%02d%02d",$year+1900,$mon+1,$mday); ################################################################################################################################ open(FILE, "$EVENTDIRECTORY/$RUNDATE.export.CSV"); while() { @vars = split/\t/, $_; $ccgns = uc($vars[51]); $vars[57]=~s/\s+$//; if (!($vars[49] == 1 && $vars[51] eq 'US') && $vars[1]==$RUNDATE && (abs($vars[53]) > 0 || abs($vars[54]) > 0)) { $found = 0; if ($vars[28] == 14) { $key = "$vars[53]\t$vars[54]\t1"; $EVENTS{$key}+=$vars[33]; $LOCATIONS{$key} = $vars[50]; #if ($vars[57]=~/http/) {$URL{$key} .= $vars[57] ."\n";} $IDSATKEY{$key} .= $vars[0] . ','; $EVENTIDS{$vars[0]} = 1; } if ($vars[29] == 4 && $vars[12] ne '' && $vars[22] ne '') { $key = "$vars[53]\t$vars[54]\t2"; $EVENTS{$key}+=$vars[33]; $LOCATIONS{$key} = $vars[50]; $IDSATKEY{$key} .= $vars[0] . ','; $EVENTIDS{$vars[0]} = 1; } } } close(FILE); ################################################################################################################################ ################################################################################################################################ open(FILE, "$EVENTDIRECTORY/$RUNDATE.gkg.csv"); while() { @fields = split/\t/, $_; #0=date, 1=numarts, 2=setcounts, 3=themes, 4=locations, 5=persons, 6=orgs, 7=tone, 8=cameoevents, 9=sources, 10=sourceurls $foundevent = 0; foreach $eventid (split/,/, $fields[8]) { if (exists($EVENTIDS{$eventid})) { $foundevent = 1; } } if ($foundevent == 1) { foreach $eventid (split/,/, $fields[8]) { $EVENTTHEMES{$eventid} .= $fields[3]; $EVENTSETCOUNTS{$eventid} .= $fields[2]; $EVENTURLS{$eventid} .= $fields[10] . ''; } foreach $url (split//, $fields[10]) { $URLSCORE{$url}+= (length($fields[2])*2 + length($fields[3])); if ($fields[3]=~/conflict/i || $fields[3]=~/milit/i || $fields[3]=~/terror/i || $fields[3]=~/protest/i) { $URLSCORE{$url}+=(length($fields[2])*2 + length($fields[3]))*2; } #give it a big boost for relevant themes... if ($fields[2]=~/protest/ || $fields[2]=~/kill/) { $URLSCORE{$url}+=(length($fields[2])*2 + length($fields[3]))*2; } #another boost if it has relevant counts in it... } } } close(FILE); ################################################################################################################################ ################################################################################################################################ open(OUT, ">./GDELTLIVE.CSV"); print OUT "Lat\tLong\tType\tCount\tLocation\tURLs\n"; foreach $key (keys %EVENTS) { if ($EVENTS{$key} >= 3) { #here we operate at the level of the city and protest/violence, with all events collapsed at that level... ($lat, $long, $type) = split/\t/, $key; $themes = ''; $setcounts = ''; $urls = ''; foreach $eventid (split/,/, $IDSATKEY{$key}) { $themes .= $EVENTTHEMES{$eventid}; $setcounts .= $EVENTSETCOUNTS{$eventid}; $urls .= $EVENTURLS{$eventid} . ''; } $dispthemes = ''; undef(%hash); foreach $lkey (split /;/, $themes) {$hash{$lkey}++;} foreach $lkey (sort {$hash{$b} <=> $hash{$a} } keys %hash) { $dispthemes .= "$lkey,"; } $brief = ''; if ($themes=~/conflict/i || $themes=~/milit/i || $themes=~/terror/i) {$brief .= 'Violence,';} if ($themes=~/protest/i) {$brief .= 'Protest,';} $urls=~s/\s+//g; ############ $linkurls = ''; $cnt = 0; undef(%hash); foreach $url (split//, $urls) { $hash{$url}=$URLSCORE{$url};} foreach $url (sort {$hash{$b} <=> $hash{$a}} keys %hash) { ($domain) = $url=~/(http[s]*\/\/[^\/]+\/)/; if ($cnt < 3 && length($url) > 5 && $url=~/http/i) {$linkurls .= " $domain
"; $cnt++; $LINKRAND++;} } ############ $gnewsquery = $LOCATIONS{$key}; $gnewsquery=~s/,.*?,//; $gnewsquery=~s/'"//g; if ($type == 1) {$gnewsquery.=' protests';} else {$gnewsquery.=' violence';} $gnewsquery =~ s/([^^A-Za-z0-9\-_.!~*'()])/ sprintf "%%%0x", ord $1 /eg; $linkurls.=" Google News
"; $LINKRAND++; ############ if ($type == 1) {$typehuman = "Protests";} else {$typehuman = "Violence Against Civilians";} $LOCATIONS{$key}=~s/'"//g; if ($brief ne '') {print OUT "$key\t$EVENTS{$key}\t$LOCATIONS{$key}\t$linkurls\n";} } } close(OUT); ################################################################################################################################ ################################################################################################################################ #now finally we are ready to upload it all to CartoDB via web-based API calls... basically we loop over the file and issue each insert as a separate SQL query... #first issue the query to clear out the table of yesterday's events... print "Truncating GDELTLive...\n"; $response = $useragent->request(HTTP::Request->new(GET=>"http://$CARTODBACCOUNT.cartodb.com/api/v2/sql?q=truncate gdeltlive&api_key=$CARTODBAPIKEY")); if (!$response->is_success) { print "ERROR: '" . $response->status_line . "\n"; } #and now loop over the events from today and insert them all... open(FILE, "./GDELTLIVE.CSV"); $line = 0; while() { if ($line == 0) { goto nextline; } ($lat, $long, $type, $count, $location, $urls) = split/\t/, $_; $urls=~s/\s+$//; $location=~s/\?//g; #special just for location (URLs can have question marks in them)... $location=~s/'//g; $urls=~s/'/"/g; $location =~ s/([^^A-Za-z0-9\-_.!~*'()])/ sprintf "%%%0x", ord $1 /eg; $urls =~ s/([^^A-Za-z0-9\-_.!~*'()])/ sprintf "%%%0x", ord $1 /eg; print "\tInserting $line...\n"; $response = $useragent->request(HTTP::Request->new(GET=>"http://$CARTODBACCOUNT.cartodb.com/api/v2/sql?q=insert into gdeltlive (the_geom,count,location,type,urls) values (ST_SetSRID(ST_Point($long, $lat),4326), $count, '$location', $type, '$urls')&api_key=$CARTODBAPIKEY")); if (!$response->is_success) { print "ERROR: '" . $response->status_line . "\n"; } nextline: $line++; } close(FILE); print "Uploaded $line lines...\n"; ################################################################################################################################