#!/usr/bin/perl

# makefetchcmds.pl - download all clips matching a GDELT TV AI query.
#
# Each clipgallery request is issued with maxrecords=5000, so the script
# first fetches the daily timeline for the query, groups consecutive days
# into blocks whose summed timeline counts stay below 4900, and then issues
# one clipgallery request per block, appending the rows (minus each
# response's first line, the CSV header) to the output file.
#
# Usage:
#   ./makefetchcmds.pl START END QUERY OUTFILE
#
# use like:
#   time ./makefetchcmds.pl 20200101 20200818 [ESCAPEDQUERY] ./RESULTS-TRUMPTWEETS-20200101-20200817.CSV
#
# START and END are dates in YYYYMMDD form.
#
# The query portion should be the entire contents of the URL parameter
# "query" ala:
#   %20(ocr:"REALDONALDTRUMP"%20OR%20ocr:"REALDONALD")%20%20(station:KGO%20OR%20station:KPIX%20OR%20station:KNTV%20OR%20station:CNN%20OR%20station:MSNBC%20OR%20station:FOXNEWS%20OR%20station:BBCNEWS%20)%20
#
# RESULTS HAVE FORMAT:
#   iaClipUrl,date,station,show,iaShowId,iaThumbnailUrl,ocr,asr,caption,captionnlp,visualEntities
#
# Exit status is 0 on success and non-zero on invalid arguments or when the
# timeline cannot be fetched.

use strict;
use warnings;

use constant API_ENDPOINT  => 'https://api.gdeltproject.org/api/v2/tvai/tvai';
use constant BLOCK_LIMIT   => 4900;    # flush a block before its summed count reaches this
use constant MAX_RECORDS   => 5000;    # maxrecords sent with every clipgallery request
use constant REQUEST_PAUSE => 5;       # seconds to sleep after each clipgallery request

my ($start_date, $end_date, $query, $outfile) = @ARGV;

fail('Invalid Start Date') unless defined $start_date && $start_date =~ /^\d{8}$/;
fail('Invalid End Date')   unless defined $end_date   && $end_date   =~ /^\d{8}$/;
fail('Invalid Query')      unless defined $query      && length($query) >= 5;
fail('Invalid Outfile')    unless defined $outfile    && length($outfile) >= 1;

# NOTE: the query is handed to curl as a single argv element (no shell), so
# the double quotes it contains need no escaping here.

# fetch the timeline...
my $timeline = fetch_timeline($start_date, $end_date, $query);

# ...group its days into blocks and download one block at a time.
for my $block (plan_blocks($timeline, BLOCK_LIMIT)) {
    download_block($block, $query, $outfile);
}

print "Done...\n\n";

# Report an argument error on STDERR and terminate with a failing status.
#   $reason - text placed after "ERROR: ".
sub fail {
    my ($reason) = @_;
    print STDERR "ERROR: $reason\n";
    exit 1;
}

# Build an API request URL.
#   $first_day, $last_day - YYYYMMDD dates; 000000 / 235959 are appended so
#                           the range covers both days entirely.
#   $params               - mode-specific parameters, already joined with '&'.
#   $raw_query            - value for the "query" parameter, used verbatim.
# Returns the URL string.
sub build_url {
    my ($first_day, $last_day, $params, $raw_query) = @_;
    return API_ENDPOINT
        . "?startdatetime=${first_day}000000"
        . "&enddatetime=${last_day}235959"
        . "&${params}&query=${raw_query}";
}

# Start "curl -s URL" without going through a shell and return a read handle
# on its standard output. Dies if curl cannot be started.
sub open_curl {
    my ($url) = @_;
    open(my $curl, '-|', 'curl', '-s', $url)
        or die "ERROR: cannot run curl: $!\n";
    return $curl;
}

# Fetch the timelinevol CSV for the whole date range.
# Returns the response body as one string ('' when the response is empty).
# Dies if curl exits with a non-zero status, since nothing can be planned
# without the timeline.
sub fetch_timeline {
    my ($first_day, $last_day, $raw_query) = @_;

    my $curl = open_curl(
        build_url($first_day, $last_day, 'mode=timelinevol&format=csv', $raw_query)
    );
    my $csv = do { local $/; <$curl> };
    close($curl)
        or die 'ERROR: curl exited with status ' . ($? >> 8)
             . " while fetching the timeline\n";

    return defined $csv ? $csv : '';
}

# Group the timeline's days into download blocks.
#   $csv   - timeline CSV text; rows whose first field does not start with
#            "YYYY-MM" (e.g. the header) are skipped.
#   $limit - a block is closed before its summed count would reach this.
# Returns a list of [first_day, last_day, count] array refs with the dashes
# removed from the dates. Days with a zero count before the first non-zero
# day are not part of any block.
sub plan_blocks {
    my ($csv, $limit) = @_;

    my @blocks;
    my $block_start;         # first day of the block being accumulated
    my $block_count = 0;     # summed count of the block being accumulated
    my $previous_day;        # last day seen, i.e. the end of the current block

    for my $row (split /\n/, $csv) {
        my ($date, @fields) = split /,/, $row;
        next unless defined $date && $date =~ /^\d\d\d\d-\d\d/;    # skip header...
        $date =~ s/-//g;

        # Sum every remaining field of the row. Non-numeric fields count as
        # whatever Perl numifies them to (0 for plain text).
        my $day_count = 0;
        {
            no warnings 'numeric';
            $day_count += $_ for @fields;
        }

        # set our start date at the beginning to the first non-zero date...
        if (!defined $block_start && $day_count > 0) {
            $block_start = $date;
        }

        if ($block_count + $day_count < $limit) {
            # adding this date to our current block is ok...
            $block_count += $day_count;
        }
        else {
            # Close the current block and start a new one at this day. An
            # empty block is never emitted: that happens when the very first
            # non-zero day already reaches the limit, and emitting it would
            # produce a request with no valid end date.
            push @blocks, [ $block_start, $previous_day, $block_count ]
                if $block_count > 0;
            $block_start = $date;
            $block_count = $day_count;
        }

        $previous_day = $date;
    }

    # and add the last block...
    push @blocks, [ $block_start, $previous_day, $block_count ]
        if $block_count > 0;

    return @blocks;
}

# Download one block with a clipgallery request and append every line of the
# response except the first (the CSV header) to the output file, then pause.
#   $block     - [first_day, last_day, count] as returned by plan_blocks.
#   $raw_query - value for the "query" parameter.
#   $path      - file to append to (created if missing).
# Dies if the output file cannot be written; a failing curl only warns so
# that the remaining blocks are still attempted.
sub download_block {
    my ($block, $raw_query, $path) = @_;
    my ($first_day, $last_day, $count) = @{$block};

    print "Downloading $first_day - $last_day ($count results)\n";
    if ($count > MAX_RECORDS) {
        warn "WARNING: block $first_day - $last_day has $count results, more than maxrecords="
           . MAX_RECORDS . "; the download may be truncated\n";
    }

    open(my $out, '>>', $path)
        or die "ERROR: cannot append to $path: $!\n";

    my $curl = open_curl(
        build_url(
            $first_day, $last_day,
            'mode=clipgallery&format=csv&maxrecords=' . MAX_RECORDS . '&sort=dateasc',
            $raw_query,
        )
    );

    my $header = <$curl>;    # drop the header line of this response
    while (my $line = <$curl>) {
        print {$out} $line;
    }

    close($out)
        or die "ERROR: cannot write to $path: $!\n";
    close($curl)
        or warn 'WARNING: curl exited with status ' . ($? >> 8)
              . " for block $first_day - $last_day\n";

    sleep(REQUEST_PAUSE);
    return;
}