diff --git a/data_acquisition/README.md b/data_acquisition/README.md index f7a9004..fef9674 100644 --- a/data_acquisition/README.md +++ b/data_acquisition/README.md @@ -50,10 +50,17 @@ You will get four output files: 1. a hydrated_tweets.zip file which contains a zipped version of the tweets_full.json file. 1. a hydrated_tweets_short.json which contains a shortened version of the hydrated tweets. -**streaming.py** This utility fetches all the available tweets from the Twitter Stream. This utility does not take any search terms. Instead, it downloads tweets each day. The purpose of this is to leave the tool running on a separate process and constantly collect tweets for a period of time. +**streaming.py** This utility fetches all the available tweets from the Twitter Stream. This utility does not take any search terms. Instead, it downloads tweets each day. The purpose of this is to leave the tool running on a separate process and constantly collect tweets for a period of time. The output of this utility is a json file with the 1% streaming sample with the day's date as the filename. This utility takes an argument to determine whether to compress the downloaded json file or not. If the compression option is enabled, the json file is compressed in .zip format and file compression is carried out only after the day changes. +The compression won't work if the utility is terminated externally. + +**Note** - If you would like to compress the json file, you will have to pass "compress" as the argument. ``` -Usage : python streaming.py +Usage +python streaming.py nocompress #does not compress the json file +python streaming.py compress #compresses the json file + + ``` ### Output: You will get one json file per day with the date as prefix of the file.
\ No newline at end of file diff --git a/data_acquisition/streaming.py b/data_acquisition/streaming.py index 0fd670d..98ab1a0 100644 --- a/data_acquisition/streaming.py +++ b/data_acquisition/streaming.py @@ -6,6 +6,8 @@ import json import time import sys +import zipfile +import zlib try: import smtplib @@ -17,6 +19,7 @@ # Within the output directory, the script loads a file named FILTER with the terms to be tracked (one per line) outputDir = "outputDir" +compress_status = sys.argv[1] if len(sys.argv) > 1 else "nocompress" ## End of Settings### @@ -51,6 +54,14 @@ def close(self): except: #Log/email pass + + def compress(self): + print("compressing the json file") + output_file_noformat = self.filename.split(".",maxsplit=1)[0] + compression = zipfile.ZIP_DEFLATED + zf = zipfile.ZipFile('{}.zip'.format(outputDir+"/"+output_file_noformat), mode='w') + zf.write(outputDir+"/"+self.filename, arcname=self.filename, compress_type=compression) + zf.close() #Rotate the log file if needed. #Warning: Check for log rotation only occurs when a tweet is received and not more than once every five minutes. @@ -60,6 +71,8 @@ def rotateFiles(self): filenow = "%i-%02d-%02d.json"%(d.year,d.month,d.day) if (self.filename!=filenow): print("%s - Rotating log file. Old: %s New: %s"%(datetime.now(),self.filename,filenow)) + if compress_status == "compress": + self.compress() try: self.fh.close() except: