#!/usr/bin/perl #NOTE - this script is NOT the most efficient way of building the final CSV, but is structured this way to make it easy to extend this code to your own needs, such as general purpose proximity clustering with the GKG 2.0 format... $INFILE = $ARGV[0]; $THEME = $ARGV[1]; if (!-e $INFILE || $THEME eq '') { print "USAGE: ./parsebqcsvtogeojson.pl INFILE THEME\n"; exit; } open(FILE, $INFILE); while() { ################## #from PERL Cookbook - parse a CSV file while properly handling quoted blocks with commas inside them... @columns = (); push(@columns ,$+) while $_ =~ m{ # The first part groups the phrase inside quotes "([^\"\\]*(?:\\.[^\"\\]*)*)",? | ([^,]+),? | , }gx; push(@columns, undef) if substr($_, -1,1) eq ','; #COLUMNS: ($DATE, $DocumentIdentifier, $SourceCommonName, $V2Themes, $V2Locations, $V2Tone, $SharingImage, $TranslationInfo) $sharingimage = $columns[6]; $domain = $columns[2]; $url = $columns[1]; $translationinfo = $columns[7]; $V2Tone = $columns[5]; $V2Themes = $columns[3]; $V2Locations = $columns[4]; ################## ################## #do any post-filtering you want to do here to refine your results after exporting from BQ... #if ($columns[3]!~/DRONE.*?DRONE/) { goto skiprow; } ################## ################## #clean up the sharing image... this eliminates a set of false positives... $match = lc($sharingimage); if ($match!~/button/ && $match!~/bttn/ && $match!~/\.gif$/ && $match!~/\bicon/ && $match!~/\banalytics\b/ && $match!~/template/ && $match!~/\bskins\b/ && $match!~/\bbanner/ && $match!~/default/ && $match!~/logo/ && $match!~/img_fb/ && $match!~/facebook/ && $match!~/figaro/ && $match!~/og\-angop/ && $match!~/gannett\-cdn/ && $match!~/no_preview/ && $match!~/strat_n/ && $match!~/risingkashmir/ && $match!~/nohotlinks/ && $match!~/og_image_meridianstar/ && $match!~/jdsupra\.com/ && $match!~/cubadebate-ipad/ && $match!~/fb_image/ && $match!~/top_stories_stopimg/ && $match!~/story-thumb-large/ && $match!~/g\-mtn\.png/ && $match!~/the_tribune_sq/ && $match!~/mynorthwest\.com/ && $match!~/apmobile/ && $match!~/\-square/ && $match!~/townnews\.com/ && length($match) < 1200) { } else { $sharingimage = ''; } if ($match=~/\.jpg/ || $match=~/\.png/) { } else { $sharingimage = ''; } #clean up URL and sharing image URL... backslashes cause CartoDB to fail on import... $domain=~s/\\/\//g; $url=~s/\\/\//g; $sharingimage=~s/\\/\//g; #do some other sanity checks on the the URL, domain, and sharing image... $domain=~s/['"]//g; $url=~s/['"]//g; $sharingimage=~s/['"]//g; #and clear out any illegal characters in any of them... this can break a few URLs (in reality this should have more complex logic to properly escape them), but is a quick and easy fix... $domain=~s/[^A-Za-z0-9\.\/\%\$\&\!\?\#\(\)\:\;\-\_=]//g; $url=~s/[^A-Za-z0-9\.\/\%\$\&\!\?\#\(\)\:\;\-\_=]//g; $sharingimage=~s/[^A-Za-z0-9\.\/\%\$\&\!\?\#\(\)\:\;\-\_=]//g; #and get the language of the article... $LANG = ''; ($LANG) = $translationinfo=~/srclc:([a-z]{3})/; if ($LANG eq '') { $LANG = 'eng'; } #and get its average basic tone and wordcount... $TONE = 0; $WORDCOUNT = 0; ($TONE, $WORDCOUNT) = $V2Tone=~/^([^,]+),.*,(\d+)$/; $TONE = sprintf("%0.2f", $TONE); ################## ################## #parse the themes list and hash up the offsets of all themes appearing in the article... skip the TAX_WORLDLANG field, since that is for more specialized queries - reenable if needed, but helps reduce memory footprint... undef(%themes); undef(@themeoffsets); undef(%themes_ownerlocid); undef(%themes_ownerlocdist); foreach $pair (split/;/, $V2Themes) { ($key, $offset) = split/,/, $pair; if ($key!~/TAX_WORLDLANG/) { $themes{$offset} = $key; } } @themeoffsets = sort keys %themes; ################## ################## #parse the locations list and hash up the offsets of all locations appearing in the article... undef(%locations_details); undef(%locations_human); undef(%locations_themes); foreach $block (split/;/, $V2Locations) { @vars = split/\#/, $block; ($type, $name, $lat, $long, $featureid, $offset) = ($vars[0], $vars[1], $vars[5], $vars[6], $vars[7], $vars[8]); if (($featureid eq 'US') || ($lat == 0 && $long == 0)) { goto skiptonextloc; } #skip country-level US hits... #we have a good location, so clean it up... $name=~s/[^A-Za-z\-, ]+//g; #get rid of bad characters... $lat = sprintf("%0.4f", $lat); $long = sprintf("%0.4f", $long); #cartodb can error if we have a flat number like "101" and so we need to cast all of them into floats like "101.0000" #hash up the details of this location... $geotype = 1; if ($type == 1) { $geotype = 1; } #country... elsif ($type == 2 || $type == 5) { $geotype = 2; } #2=usstate, 5=foreign state elsif ($type == 3 || $type == 4) { $geotype = 3; } #us or world city... $locations_details{$featureid} = "$name#$lat#$long#$geotype"; $locations_human{$name}++; #now compile a list of all of the themes that occurred in close proximity to this location mention... foreach $key (@themeoffsets) { if (abs($offset - $key) < 1000) { #1000 characters gives us a nice window - expand or tighten this as needed or desired... $dist = abs($offset - $key); if ($themes_ownerlocdist{$key} == 0 || $dist < $themes_ownerlocdist{$key}) { $themes_ownerlocdist{$key} = $dist; $themes_ownerlocid{$key} = $featureid; } } } skiptonextloc: } ################## ################## #now that we've processed all of the locations (so they've each had a chance to vie for ownership over each theme/name), now go back and formally build the list of themes for each location... foreach $key (@themeoffsets) { if (exists($themes_ownerlocid{$key})) { $locations_themes{$themes_ownerlocid{$key}} .= "$themes{$key};"; } } ################## ################## #and now write all of the locations... we do this way so that if a location is mentioned multiple times in an article (which is common), we only write it a single time to the file... #write this location out to the JSON file... $themelist = ''; $namelist = ''; foreach $featureid (keys %locations_details) { ($name, $lat, $long, $type) = split/\#/, $locations_details{$featureid}; ####### #clean up the names and themes lists to only include each mention a single time... we don't need to allow multiple mentions to be used for relevancy since we're already doing that by proximity... $themelist = ''; undef(%hash); foreach $key (split/;/, $locations_themes{$featureid}) { $hash{$key} = 1; }; foreach $key (keys %hash) { $themelist .= $key . ';'; } if (length($themelist) > 1) { $themelist = ';' . $themelist; } ####### #do some quick sanity cleaning to sweep out some things that upset CartoDB... $name=~s/[^A-Za-z0-9\-_;, ]//g; $themelist=~s/[^A-Za-z0-9\-_;, ]//g; #allow numbers for WB themes... #and clean out spans of repeated semicolons... $themelist=~s/;[;]+/;/g; ####### #lowercase the domain and lang... should already be lowercase, but just in case... $domain = lc($domain); $LANG = lc($LANG); ####### #and write our processed output... use the fields below if you were to want to import these results into your own database platform... #print OUT "$themelist\tdomain:$domain\tlang:$LANG\t$long, $lat\tgeoname:$name\t$type\t$url\t$sharingimage\t$TONE\t$WORDCOUNT\n"; #instead here we are going to cluster further into a single record per location for maximal disk minimization... and we're going to ignore most of the fields we extracted... if ($themelist=~/$THEME/) { #requires our theme to be present in this particular location mention (since most locations in an article are not found near discussion of our theme)... #print "$name\t$long,$lat\t$url\t$sharingimage\t$themelist\n"; $key = "$long, $lat"; $CLUSTER_CNT{$key}++; $CLUSTER_NAME{$key} = $name; if (length($sharingimage) > 7) { $CLUSTER_SHARINGIMG{$key} = $sharingimage; } $CLUSTER_DETAILS{$key} .= "Article Link
"; } } ################## skiprow: } close(FILE); ######################################################## #and finally write the results out... open(OUT, ">$INFILE.geojson"); print OUT "{ \"type\": \"FeatureCollection\", \"features\": [\n\n"; foreach $key (keys %CLUSTER_CNT) { #print OUT "$key\t$CLUSTER_CNT{$key}\t$CLUSTER_NAME{$key}\t$CLUSTER_SHARINGIMG{$key}\t$CLUSTER_DETAILS{$key}\n"; $urllinks = $CLUSTER_DETAILS{$key}; $urllinks=~s/"/\\"/g; if ($WROTEROWS > 0) { print OUT ",\n"; } print OUT "{ \"type\": \"Feature\", \"geometry\": { \"type\": \"Point\", \"coordinates\": [$key] }, \"properties\": { \"name\": \"$CLUSTER_NAME{$key}\", \"numarts\": $CLUSTER_CNT{$key}, \"sharingimage\": \"$CLUSTER_SHARINGIMG{$key}\", \"urllinks\": \"$urllinks\" } }"; $WROTEROWS++; } print OUT "\n\n] } "; close(OUT); ########################################################