#!/usr/bin/perl
#
# parsebqcsvtogeo_countrydensitymap.pl INFILE NORMFILE
#
# Builds a per-country table for CartoDB out of two CSV files whose first two
# columns are a web domain and a numeric count:
#
#   INFILE    - counts that form the numerator of the percentage.
#   NORMFILE  - counts that form the denominator (the normalisation baseline).
#
# Each domain is mapped to a country name through the local lookup file
# DOMAINSBYCOUNTRY-ALLLANGUAGES.TXT (tab separated: domain, country code,
# country name), the counts are summed per country, and the result is written
# to "INFILE.CARTODB.CSV" with the columns Country, Count, Percent, where
# Percent = 100 * INFILE total / NORMFILE total.
#
# Rows whose domain is not in the lookup file, or whose count is not numeric
# (for example a header row), are ignored.

use strict;
use warnings;
use Scalar::Util qw(looks_like_number);

# Lookup file, available from
# http://data.gdeltproject.org/supportingdatasets/DOMAINSBYCOUNTRY-ALLLANGUAGES.TXT
my $DOMAIN_FILE = 'DOMAINSBYCOUNTRY-ALLLANGUAGES.TXT';

# Country names that are renamed for CartoDB.
my %CARTODB_NAME_FOR = (
    'Macau'                           => 'China',
    'Vietnam, Democratic Republic of' => 'Vietnam',
    'Czechoslovakia'                  => 'Slovakia',
);

my ($INFILE, $NORMFILE) = @ARGV;

if (   !defined $INFILE
    || !defined $NORMFILE
    || !-e $INFILE
    || !-e $NORMFILE)
{
    print STDERR "USAGE: ./parsebqcsvtogeo_countrydensitymap.pl INFILE NORMFILE\n";
    exit 1;
}

if (!-e $DOMAIN_FILE) {
    print STDERR "FATAL: Please download \"http://data.gdeltproject.org/supportingdatasets/DOMAINSBYCOUNTRY-ALLLANGUAGES.TXT\" into the local directory!\n";
    exit 1;
}

my $country_for_domain = load_domain_map($DOMAIN_FILE);
my $count_for          = sum_counts_by_country($INFILE,   $country_for_domain);
my $norm_for           = sum_counts_by_country($NORMFILE, $country_for_domain);

write_density_csv("$INFILE.CARTODB.CSV", $count_for, $norm_for);

exit 0;

##########################################################################
# load_domain_map($path)
#
# Reads the tab-separated domain lookup file and returns a hash reference
# mapping each domain to its (CartoDB-adjusted) country name.
# Dies if the file cannot be opened.
sub load_domain_map {
    my ($path) = @_;

    open my $fh, '<', $path or die "Cannot open $path: $!\n";

    my %country_for_domain;
    while (my $line = <$fh>) {
        my ($domain, undef, $ccname) = split /\t/, $line;
        next unless defined $ccname;    # malformed line: fewer than 3 columns

        # The country name is the last column, so it carries the line ending.
        $ccname =~ s/\s+$//;

        $ccname = $CARTODB_NAME_FOR{$ccname}
            if exists $CARTODB_NAME_FOR{$ccname};

        $country_for_domain{$domain} = $ccname;
    }
    close $fh;

    return \%country_for_domain;
}

##########################################################################
# parse_csv_line($line)
#
# Splits one CSV record (without its line ending) into a list of fields,
# keeping commas that appear inside double-quoted fields. Empty fields are
# returned as undef. Pattern taken from the Perl Cookbook.
sub parse_csv_line {
    my ($line) = @_;

    my @fields;
    while (
        $line =~ m{
              "([^\"\\]*(?:\\.[^\"\\]*)*)",?    # quoted field, backslash escapes allowed
            | ([^,]+),?                         # unquoted field
            | ,                                 # empty field
        }gx
      )
    {
        # $+ is the capture group that took part in this match
        # (undef for the empty-field alternative).
        push @fields, $+;
    }

    # A trailing comma means one more, empty, field.
    push @fields, undef if $line =~ /,\z/;

    return @fields;
}

##########################################################################
# sum_counts_by_country($path, $country_for_domain)
#
# Reads a CSV file whose first column is a domain and whose second column is
# a count, and returns a hash reference mapping country name to the sum of the
# counts of all its domains. Rows with an unknown domain or a non-numeric
# count are skipped. Dies if the file cannot be opened.
sub sum_counts_by_country {
    my ($path, $country_for_domain) = @_;

    open my $fh, '<', $path or die "Cannot open $path: $!\n";

    my %total_for;
    while (my $line = <$fh>) {
        $line =~ s/\r?\n\z//;    # strip LF or CRLF so the last field is clean
        next unless length $line;

        my ($domain, $count) = parse_csv_line($line);
        next unless defined $domain;
        next unless defined $count && looks_like_number($count);

        my $country = $country_for_domain->{$domain};
        next unless defined $country && length $country;

        $total_for{$country} += $count;
    }
    close $fh;

    return \%total_for;
}

##########################################################################
# write_density_csv($path, $count_for, $norm_for)
#
# Writes the CSV output: one row per country in $norm_for that has a positive
# normalisation total, sorted by country name. Count defaults to 0 for
# countries absent from $count_for. Dies if the file cannot be written.
sub write_density_csv {
    my ($path, $count_for, $norm_for) = @_;

    open my $out, '>', $path or die "Cannot write $path: $!\n";
    print {$out} "Country,Count,Percent\n";

    for my $country (sort keys %{$norm_for}) {
        my $norm = $norm_for->{$country};

        # A zero (or negative) baseline has no meaningful percentage and
        # would otherwise divide by zero.
        next unless $norm > 0;

        my $count = defined $count_for->{$country} ? $count_for->{$country} : 0;

        my $percent = sprintf '%0.4f', ($count / $norm) * 100;
        $percent = 0 if $percent <= 0;

        # Country names may contain commas, so they are always quoted;
        # embedded quotes are doubled per CSV convention.
        (my $quoted = $country) =~ s/"/""/g;

        print {$out} "\"$quoted\",$count,$percent\n";
    }

    # Buffered write errors only surface at close.
    close $out or die "Cannot close $path: $!\n";

    return;
}