tweetSanitization.py
#!/usr/bin/python
#
import json
import time
#
fO = open("main_table.txt", "w")
with open('tweets_ds.json', 'r') as f:
    cnt=1
    for cnt, line in enumerate(f):
        tweet = json.loads(line) # load it as Python dict
    #print(json.dumps(tweet, indent=4)) # pretty-print
        try: 
            tweetCreated = time.strftime('%Y-%m-%d %H:%M:%S', time.strptime(tweet['created_at'],'%a %b %d %H:%M:%S +0000 %Y'))
        except:
            tweetCreated ='NULL'
        try:
            tweetID=str(tweet["id"])
        except:
            tweetID='0'
        try:
            tweetText = str(tweet["text"]).replace("\n","").replace("\r","").replace("\t","").replace(";","").strip()
        except:
            tweetText = 'NULL'
        try:
            display_text_range = str(tweet["display_text_range"]).strip()
        except: 
            display_text_range = 'NULL'
        try:
            source = str(tweet["source"])
        except: 
            source = 'NULL'
        try:
            truncated = str(tweet["truncated"])
        except: 
            truncated= 'NULL'
        try:
            in_reply_to_status_id = str(tweet["in_reply_to_status_id"])
        except: 
        	  in_reply_to_status_id= '0'
        try:
        	  in_reply_to_user_id = str(tweet["in_reply_to_user_id"])
        except: 
        	  in_reply_to_user_id= '0'
        try:
        	  in_reply_to_screen_name = str(tweet["in_reply_to_screen_name"]).strip()
        except: 
        	  in_reply_to_screen_name = 'NULL'
        try:
        	  tweetUser = str(tweet["user"]["id"])
        except: 
        	  tweetUser = '0'
        try:
        	 tweetGeoType = str(tweet["geo"]["type"]).strip()
        except: 
        	 tweetGeoType = 'NULL'
        try:
        	 tweetGeoCoord = str(tweet["geo"]["coordinates"]).strip()
        except: 
        	 tweetGeoCoord = 'NULL'
        try:
        	 tweetCoordType = str(tweet["coordinates"]["type"]).strip()
        except: 
        	 tweetCoordType = 'NULL'
        try:
        	 tweetCoord = str(tweet["coordinates"]["coordinates"]).strip()
        except: 
        	 tweetCoord = 'NULL'
        try:
        	 placeID = str(tweet["place"]["id"]).strip()
        except: 
        	 placeID = 'NULL'
        try:
        	 placeURL = str(tweet["place"]["url"]).strip()
        except: 
        	 placeURL = 'NULL'
        try:
        	 placeType = str(tweet["place"]["place_type"]).strip()
        except: 
        	 placeType = 'NULL'
        try:
        	 placeCountry = str(tweet["place"]["country"]).strip()
        except: 
        	 placeCountry = 'NULL'
        try: 
            tweetPFullName = str(tweet["place"]["full_name"]).strip()
        except:
            tweetPFullName = 'NULL'
        try: 
            tweetPCountryCode = str(tweet["place"]["country_code"]).strip()
        except:
            tweetPCountryCode ='NULL'
        try:
            tweetPname = str(tweet["place"]["name"]).strip()
        except:
            tweetPname = 'NULL'
        try:
            tweetPBBox1x= str(tweet["place"]["bounding_box"]["coordinates"][0][0][0])
        except:
            tweetPBBox1x='NULL'
        try:
            tweetPBBox1y= str(tweet["place"]["bounding_box"]["coordinates"][0][0][1])
        except:
            tweetPBBox1y='NULL'
        try:
            tweetPBBox2x= str(tweet["place"]["bounding_box"]["coordinates"][0][1][0])
        except:
            tweetPBBox2x='NULL'
        try:
            tweetPBBox2y= str(tweet["place"]["bounding_box"]["coordinates"][0][1][1])
        except:
            tweetPBBox2y='NULL'
        try:
            tweetPBBox3x= str(tweet["place"]["bounding_box"]["coordinates"][0][2][0])
        except:
            tweetPBBox3x='NULL'
        try:
            tweetPBBox3y= str(tweet["place"]["bounding_box"]["coordinates"][0][2][1])
        except:
            tweetPBBox3y='NULL'
        try:
            tweetPBBox4x= str(tweet["place"]["bounding_box"]["coordinates"][0][3][0])
        except:
            tweetPBBox4x='NULL'
        try:
            tweetPBBox4y= str(tweet["place"]["bounding_box"]["coordinates"][0][3][1])
        except:
            tweetPBBox4y='NULL'
        try:
        	  placeAttributes = str(tweet["place"]["attributes"]).strip()
        except: 
        	  placeAttributes = 'NULL'
        try:
        	  tweetContributors = str(tweet["place"]["contributors"]).strip()
        except: 
        	  tweetContributors = 'NULL'
        try:
        	  is_quote_status= str(tweet["is_quote_status"])
        except: 
        	  is_quote_status= 'NULL'
        try:
        	  retweetCount= str(tweet["retweet_count"])
        except: 
        	  retweetCount= '0'
        try:
        	  favoriteCount= str(tweet["favorite_count"])
        except: 
        	  favoriteCount= '0'
        try:
        	  entitiesID= str(tweet["entities"]["user_mentions"]["id"])
        except: 
        	  entitiesID = '0'
        try:
        	  entitiesMediaID= str(tweet["entities"]["media"]["id"])
        except: 
        	  entitiesMediaID = '0'
        try:
        	  extendedEntitiesID= str(tweet["extended_entities"]["media"]["id"])
        except: 
        	  extendedEntitiesID = '0'
        try:
        	  favoritedstr = (tweet["favorited"]).strip()
        except: 
        	  favorited = 'NULL'
        try:
        	  retweeted = (tweet["retweeted"]).strip()
        except: 
        	  retweeted = 'NULL'
        try:
        	  possibly_sensitive = (tweet["possibly_sensitive"]).strip()
        except: 
        	  possibly_sensitive = 'NULL'
        try:
        	  filter_level = (tweet["filter_level"]).strip()
        except: 
        	  filter_level = 'NULL'
        try:
        	  tweetLang = (tweet["lang"]).strip()
        except: 
        	  tweetLang = 'NULL'
        try:
        	  tweetTimestamp = (tweet["timestamp_ms"])
        except: 
        	  tweetTimestamp = '0'
        try:
        	  matchingRulesTag = (tweet["matching_rules"]["tag"]).strip()
        except: 
        	  matchingRulesTag = 'NULL'
        try:
        	  matchingRulesID = (tweet["matching_rules"]["id"])
        except: 
        	  matchingRulesID = '0'
 
        fO.write(tweetCreated + "\t" 
                 + tweetID + "\t" 
                 + tweetText + "\t"
                 + display_text_range + "\t"
                 + source + "\t"
                 + truncated + "\t"
                 + in_reply_to_status_id + "\t"
                 + in_reply_to_user_id + "\t"
                 + in_reply_to_screen_name + "\t"
                 + tweetUser + "\t"
                 + tweetGeoType + "\t"
                 + tweetGeoCoord + "\t"
                 + tweetCoordType + "\t"
                 + tweetCoord + "\t"
                 + placeID + "\t"
                 + placeURL + "\t"
                 + placeType + "\t"
                 + placeCountry + "\t"
                 + tweetPFullName + "\t" 
                 + tweetPCountryCode + "\t" 
                 + tweetPname + "\t" 
                 + tweetPBBox1x +"\t" 
                 + tweetPBBox1y  + "\t" 
                 + tweetPBBox2x + "\t" 
                 + tweetPBBox2y + "\t" 
                 + tweetPBBox3x + "\t" 
                 + tweetPBBox3y + "\t" 
                 + tweetPBBox4x + "\t" 
                 + tweetPBBox4y + "\t"
                 + placeAttributes + "\t"
                 + tweetContributors + "\t"
                 + is_quote_status + "\t"
                 + retweetCount + "\t"
                 + favoriteCount + "\t"
                 + entitiesID + "\t"
                 + entitiesMediaID + "\t"
                 + extendedEntitiesID + "\t"
                 + favorited + "\t"
                 + retweeted + "\t"
                 + possibly_sensitive + "\t"
                 + filter_level + "\t"
                 + tweetLang + "\t"
                 + tweetTimestamp + "\t"
                 + matchingRulesTag + "\t"
                 + matchingRulesID + "\n")
 
        cnt +=1
  • 01_30_2019.txt
  • Last modified: 2019/02/01 03:05
  • by cynthia.kahn