#!/usr/bin/perl
#
# Build per-country daily timelines from a (date, domain, count) CSV export.
#
# Usage: ./parsebqcsvtogeojson_countrytimeline.pl INFILE
#
# Reads:
#   INFILE                             CSV whose first three columns are used as
#                                      date, domain and count.
#   DOMAINSBYCOUNTRY-ALLLANGUAGES.TXT  tab-separated domain / country code /
#                                      country name lookup, in the current directory.
# Writes:
#   INFILE.TIMELINE-RAW.CSV     one row per date, one column per country, summed counts.
#   INFILE.TIMELINE-ZSCORE.CSV  same layout, each country's column standardized
#                               by Statistics::Zscore.

use strict;
use warnings;

use Scalar::Util qw(looks_like_number);
use Statistics::Zscore;

# Domain -> country lookup file
# (http://data.gdeltproject.org/supportingdatasets/DOMAINSBYCOUNTRY-ALLLANGUAGES.TXT).
my $DOMAIN_MAP_FILE = 'DOMAINSBYCOUNTRY-ALLLANGUAGES.TXT';

# A country gets a column only if at least this many input rows (with a
# positive count) mapped to it; this culls countries with too little data to
# produce a usable timeline.
my $MIN_COUNTRY_ROWS = 50;

# Country names rewritten before use (the original script did this for CartoDB).
my %RENAME_FOR_CARTODB = (
    'Macau'                           => 'China',
    'Vietnam, Democratic Republic of' => 'Vietnam',
    'Czechoslovakia'                  => 'Slovakia',
);

##########################################################################
# main flow

my $infile = $ARGV[0];
if (!defined $infile || !-e $infile) {
    print STDERR "USAGE: ./parsebqcsvtogeojson_countrytimeline.pl INFILE\n";
    exit 1;
}
if (!-e $DOMAIN_MAP_FILE) {
    print STDERR "FATAL: Please download \"http://data.gdeltproject.org/supportingdatasets/DOMAINSBYCOUNTRY-ALLLANGUAGES.TXT\" into the local directory!\n";
    exit 1;
}

my $country_of = load_domain_map($DOMAIN_MAP_FILE);
my ($count_for, $rows_for, $seen_date) = aggregate_counts($infile, $country_of);

# Countries that pass the row threshold, and every date seen, both in plain
# string sort order.
# NOTE(review): string sort is chronological only if the date column is in a
# sortable format such as YYYYMMDD - confirm against the export.
my @countries = grep { $rows_for->{$_} >= $MIN_COUNTRY_ROWS } sort keys %$rows_for;
my @dates     = sort keys %$seen_date;

# Raw counts version.
write_timeline_csv(
    "$infile.TIMELINE-RAW.CSV", \@dates, \@countries,
    sub { raw_count($count_for, @_) },
);

# Z-score version: standardize each country's series across all dates.
# NOTE(review): how Statistics::Zscore behaves for a zero-variance or
# single-date series is not visible here - verify before relying on it.
my $z = Statistics::Zscore->new;
my %zscore_for;
for my $country (@countries) {
    my @series       = map { raw_count($count_for, $_, $country) } @dates;
    my $standardized = $z->standardize(\@series);
    for my $i (0 .. $#dates) {
        $zscore_for{ $dates[$i] }{$country} = $standardized->[$i];
    }
}

write_timeline_csv(
    "$infile.TIMELINE-ZSCORE.CSV", \@dates, \@countries,
    sub {
        my ($date, $country) = @_;
        my $score = $zscore_for{$date}{$country};
        return defined $score ? $score : '';
    },
);

##########################################################################
# helpers

# load_domain_map($path) -> hashref of domain => country name.
# Each line is split on tabs into domain, country code (unused) and country
# name. Trailing whitespace is stripped from the name, the CartoDB renames are
# applied, and remaining commas become spaces so names are safe as unquoted
# CSV header cells. Dies if the file cannot be opened.
sub load_domain_map {
    my ($path) = @_;

    open(my $fh, '<', $path) or die "Cannot open $path: $!\n";

    my %name_for;
    while (my $line = <$fh>) {
        my ($domain, undef, $name) = split /\t/, $line;
        next unless defined $name;    # malformed line: no country column

        $name =~ s/\s+$//;
        $name = $RENAME_FOR_CARTODB{$name} if exists $RENAME_FOR_CARTODB{$name};
        $name =~ s/,/ /g;

        $name_for{$domain} = $name;
    }
    close($fh);

    return \%name_for;
}

# parse_csv_line($line) -> list of fields.
# Perl Cookbook CSV splitter: handles double-quoted fields containing commas
# and backslash escapes. Empty fields come back as undef. $line must already
# have its line ending removed.
sub parse_csv_line {
    my ($line) = @_;

    my @fields;
    # $+ is the text of whichever capture group matched last, i.e. the field
    # body without surrounding quotes; it is undef for the bare-comma branch.
    push @fields, $+ while $line =~ m{
        "([^\"\\]*(?:\\.[^\"\\]*)*)",?   # quoted field, may hold commas and escapes
      | ([^,]+),?                        # unquoted non-empty field
      | ,                                # empty field
    }gx;
    push @fields, undef if $line =~ /,\z/;    # trailing comma means a final empty field

    return @fields;
}

# aggregate_counts($path, $country_of) -> (\%count_for, \%rows_for, \%seen_date)
#   %count_for : date => country => summed count
#   %rows_for  : country => number of contributing input rows
#   %seen_date : date => number of contributing input rows
# Rows whose count is not a positive number (including a header row) are
# skipped. Domains missing from $country_of are summed under the country ''
# and still register their date, but '' is removed from %rows_for so it can
# never become an output column. Dies if the file cannot be opened.
sub aggregate_counts {
    my ($path, $country_of) = @_;

    open(my $fh, '<', $path) or die "Cannot open $path: $!\n";

    my (%count_for, %rows_for, %seen_date);
    while (my $line = <$fh>) {
        $line =~ s/[\r\n]+\z//;

        my ($date, $domain, $count) = parse_csv_line($line);
        next unless defined $count;
        $count =~ s/\s+$//;
        next unless looks_like_number($count) && $count > 0;

        $date = '' unless defined $date;
        my $country = defined $domain ? $country_of->{$domain} : undef;
        $country = '' unless defined $country;

        $count_for{$date}{$country} += $count;
        $rows_for{$country}++;
        $seen_date{$date}++;
    }
    close($fh);

    delete $rows_for{''};

    return (\%count_for, \%rows_for, \%seen_date);
}

# raw_count($count_for, $date, $country) -> summed count for that cell.
# Returns 0 when the cell is missing or its value is below 1.
sub raw_count {
    my ($count_for, $date, $country) = @_;

    my $n = $count_for->{$date}{$country};
    return (defined $n && $n >= 1) ? $n : 0;
}

# write_timeline_csv($path, \@dates, \@countries, $cell)
# Writes a header row "Date,<country>,..." followed by one row per date.
# $cell is a code ref called as $cell->($date, $country) that returns the
# text for that cell. Dies if the file cannot be opened or fully written.
sub write_timeline_csv {
    my ($path, $dates, $countries, $cell) = @_;

    open(my $out, '>', $path) or die "Cannot write $path: $!\n";

    print {$out} join(',', 'Date', @$countries), "\n";
    for my $date (@$dates) {
        print {$out} join(',', $date, map { $cell->($date, $_) } @$countries), "\n";
    }

    # Buffered write errors only surface at close.
    close($out) or die "Error closing $path: $!\n";
    return;
}