#!/usr/bin/perl
##########################################################################
# parsebqcsvtogeojson_countrychoropleth.pl INFILE NORMFILE
#
# Aggregates per-article CSV rows into one CSV row per country, suitable
# for a CartoDB country choropleth.
#
#   INFILE    CSV with columns (URL, DOMAIN, SHARINGIMAGE), one article
#             per row.
#   NORMFILE  CSV with columns (DOMAIN, CNT); CNT is summed per country
#             and used as the denominator of the Percent column.
#
# Domains are mapped to country names through the tab-separated lookup
# file DOMAINSBYCOUNTRY-ALLLANGUAGES.TXT (domain, country code, country
# name), which must be present in the current directory.
#
# Output is written to "INFILE.CARTODB.CSV" with the header
#   Country,Count,Percent,SharingImage,ArticleList
#
# Rows whose domain is not in the lookup file are ignored (this also
# discards any CSV header row).
##########################################################################
use strict;
use warnings;
use Scalar::Util qw(looks_like_number);

# Domain -> country lookup table and where to get it.
my $DOMAIN_FILE     = 'DOMAINSBYCOUNTRY-ALLLANGUAGES.TXT';
my $DOMAIN_FILE_URL = 'http://data.gdeltproject.org/supportingdatasets/DOMAINSBYCOUNTRY-ALLLANGUAGES.TXT';

# Only the first N articles of each country contribute a link / sharing
# image; the article count itself keeps increasing past this limit.
my $MAX_LINKS_PER_COUNTRY = 50;

# Country names from the lookup file that are renamed for CartoDB.
my %RENAME_FOR_CARTODB = (
    'Macau'                           => 'China',
    'Vietnam, Democratic Republic of' => 'Vietnam',
    'Czechoslovakia'                  => 'Slovakia',
);

exit main(@ARGV);

##########################################################################
# main(INFILE, NORMFILE) - run the whole pipeline.
# Returns the process exit status: 0 on success, 1 on a usage error or a
# missing lookup file. Dies if a file cannot be opened or written.
sub main {
    my ($infile, $normfile) = @_;

    if (   !defined $infile
        || !defined $normfile
        || !-e $infile
        || !-e $normfile)
    {
        print STDERR "USAGE: ./parsebqcsvtogeojson_countrychoropleth.pl INFILE NORMFILE\n";
        return 1;
    }

    if (!-e $DOMAIN_FILE) {
        print STDERR "FATAL: Please download \"$DOMAIN_FILE_URL\" into the local directory!\n";
        return 1;
    }

    my $country_of = load_domain_countries($DOMAIN_FILE);
    my ($count_of, $links_of, $image_of) = tally_articles($infile, $country_of);
    my $norm_of = tally_norm_counts($normfile, $country_of);

    write_cartodb_csv("$infile.CARTODB.CSV",
        $count_of, $norm_of, $image_of, $links_of);

    return 0;
}

##########################################################################
# open_for_reading(PATH) - open PATH read-only and return the handle.
# Dies with the OS error if the file cannot be opened.
sub open_for_reading {
    my ($path) = @_;

    # Three-argument open: PATH is always taken as a literal file name.
    open(my $fh, '<', $path)
        or die "FATAL: cannot open \"$path\" for reading: $!\n";

    return $fh;
}

##########################################################################
# load_domain_countries(PATH) - read the tab-separated lookup file
# (domain, country code, country name) and return a hash reference
# mapping domain => country name, with the CartoDB renames applied.
sub load_domain_countries {
    my ($path) = @_;

    my %country_of;
    my $fh = open_for_reading($path);

    while (my $line = <$fh>) {
        # The country code (second field) is not used.
        my ($domain, undef, $ccname) = split /\t/, $line;
        next if !defined $ccname;

        $ccname =~ s/\s+\z//;
        next if $ccname eq '';

        #do some renaming for CartoDB...
        $ccname = $RENAME_FOR_CARTODB{$ccname}
            if exists $RENAME_FOR_CARTODB{$ccname};

        $country_of{$domain} = $ccname;
    }
    close($fh);

    return \%country_of;
}

##########################################################################
# parse_csv_line(LINE) - split one CSV line into its fields.
# Returns the list of fields; an empty field is returned as undef.
#
# The pattern is the one from the Perl Cookbook: it handles quoted
# fields that contain commas, with backslash-escaped characters inside
# the quotes.
sub parse_csv_line {
    my ($line) = @_;

    # Drop the line terminator first, otherwise it ends up inside (or as)
    # the last field and the trailing-comma check below can never fire.
    $line =~ s/\r?\n\z//;

    my @columns;
    push @columns, $+ while $line =~ m{
        "([^\"\\]*(?:\\.[^\"\\]*)*)",?   # quoted field, may contain commas
      | ([^,]+),?                        # unquoted, non-empty field
      | ,                                # empty field
    }gx;

    # A trailing comma means one more (empty) field at the end.
    push @columns, undef if $line =~ /,\z/;

    return @columns;
}

##########################################################################
# html_escape(TEXT) - escape the characters that are special in HTML
# text and double-quoted attribute values. undef is treated as ''.
sub html_escape {
    my ($text) = @_;
    $text = '' if !defined $text;

    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;

    return $text;
}

##########################################################################
# csv_quote(TEXT) - return TEXT as a double-quoted CSV field, with any
# embedded double quotes doubled. undef is written as an empty field.
sub csv_quote {
    my ($text) = @_;
    $text = '' if !defined $text;

    $text =~ s/"/""/g;

    return "\"$text\"";
}

##########################################################################
# tally_articles(PATH, COUNTRY_OF) - read INFILE, whose columns are
# (URL, DOMAIN, SHARINGIMAGE), and aggregate per country.
#
# Returns three hash references keyed by country name:
#   count_of  total number of articles
#   links_of  concatenated article links for the first
#             $MAX_LINKS_PER_COUNTRY articles
#   image_of  first non-empty sharing image among those same articles
sub tally_articles {
    my ($path, $country_of) = @_;

    my (%count_of, %links_of, %image_of);
    my $fh = open_for_reading($path);

    while (my $line = <$fh>) {
        #COLUMNS: ($URL, $DOMAIN, $SHARINGIMAGE)
        my ($url, $domain, $image) = parse_csv_line($line);
        next if !defined $domain;

        # Rows whose domain has no known country are not counted.
        my $cname = $country_of->{$domain};
        next if !defined $cname || $cname eq '';

        $image = '' if !defined $image;
        $image =~ s/\s+\z//;

        $count_of{$cname} = 0 if !exists $count_of{$cname};

        if ($count_of{$cname} < $MAX_LINKS_PER_COUNTRY) {
            # NOTE(review): the script as received appended only the
            # literal text "Article Link" plus a newline here and never
            # used the URL. The anchor markup is a reconstruction -
            # confirm it is what the CartoDB infowindow expects.
            $links_of{$cname} .= '<a href="' . html_escape($url)
                . '">Article Link</a><br>';

            if (!defined $image_of{$cname} || $image_of{$cname} eq '') {
                $image_of{$cname} = $image;
            }
        }

        #increment even if we are over our limit for this country...
        $count_of{$cname}++;
    }
    close($fh);

    return (\%count_of, \%links_of, \%image_of);
}

##########################################################################
# tally_norm_counts(PATH, COUNTRY_OF) - read NORMFILE, whose columns are
# (DOMAIN, CNT), and return a hash reference mapping country name to the
# sum of CNT over all of that country's domains. Rows with an unknown
# domain or a non-numeric CNT are skipped.
sub tally_norm_counts {
    my ($path, $country_of) = @_;

    my %norm_of;
    my $fh = open_for_reading($path);

    while (my $line = <$fh>) {
        #COLUMNS: ($DOMAIN, $CNT)
        my ($domain, $cnt) = parse_csv_line($line);
        next if !defined $domain || !defined $cnt;

        $cnt =~ s/\A\s+//;
        $cnt =~ s/\s+\z//;
        next if !looks_like_number($cnt);

        my $cname = $country_of->{$domain};
        next if !defined $cname || $cname eq '';

        $norm_of{$cname} += $cnt;
    }
    close($fh);

    return \%norm_of;
}

##########################################################################
# write_cartodb_csv(PATH, COUNT_OF, NORM_OF, IMAGE_OF, LINKS_OF) - write
# one CSV row per country in COUNT_OF, sorted by country name.
#
# Percent is count / normalizing count * 100 with four decimals. When a
# country has no positive normalizing count the Percent field is left
# empty and a warning is printed, instead of dying on a division by zero.
sub write_cartodb_csv {
    my ($path, $count_of, $norm_of, $image_of, $links_of) = @_;

    open(my $out, '>', $path)
        or die "FATAL: cannot open \"$path\" for writing: $!\n";

    print {$out} "Country,Count,Percent,SharingImage,ArticleList\n";

    foreach my $cname (sort keys %{$count_of}) {
        my $count = $count_of->{$cname};
        my $norm  = $norm_of->{$cname};

        my $perc = '';
        if (defined $norm && $norm > 0) {
            $perc = sprintf("%0.4f", ($count / $norm) * 100);
        }
        else {
            warn "WARNING: no NORMFILE count for \"$cname\"; Percent left empty\n";
        }

        print {$out} join(',',
            csv_quote($cname),
            $count,
            $perc,
            csv_quote($image_of->{$cname}),
            csv_quote($links_of->{$cname}),
        ), "\n";
    }

    # Buffered write errors only surface at close time.
    close($out)
        or die "FATAL: error while writing \"$path\": $!\n";

    return;
}
##########################################################################