by Pride Chamisa
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import requests
import tweepy
from tweepy import OAuthHandler
import json
from timeit import default_timer as timer
%matplotlib inline
df_twitter = pd.read_csv('twitter-archive-enhanced.csv')
# Making the directory if it doesn't already exist
folder_name = 'image_predictions'
if not os.path.exists(folder_name):
    os.makedirs(folder_name)
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
response = requests.get(url)
with open(os.path.join(folder_name, url.split('/')[-1]), mode='wb') as file:
    file.write(response.content)
df_predictions = pd.read_csv('image_predictions/image-predictions.tsv', sep='\t')
# Query Twitter API for each tweet in the Twitter archive and save JSON in a text file
# These are hidden to comply with Twitter's API terms and conditions
consumer_key = 'HIDDEN'
consumer_secret = 'HIDDEN'
access_token = 'HIDDEN'
access_secret = 'HIDDEN'
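As a side note, one way to keep the real values out of the notebook entirely is to read them from environment variables; this is only a sketch, and the variable names below are assumptions, not part of the original project.
# Sketch: load the credentials from environment variables (names are hypothetical)
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', 'HIDDEN')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', 'HIDDEN')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', 'HIDDEN')
access_secret = os.environ.get('TWITTER_ACCESS_SECRET', 'HIDDEN')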
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)
tweet_ids = df_twitter.tweet_id.values
len(tweet_ids)
# Query Twitter's API for JSON data for each tweet ID in the Twitter archive
count = 0
fails_dict = {}
start = timer()
# Save each tweet's returned JSON as a new line in a .txt file
with open('tweet_json.txt', 'w') as outfile:
    # This loop will likely take 20-30 minutes to run because of Twitter's rate limit
    for tweet_id in tweet_ids:
        count += 1
        print(str(count) + ": " + str(tweet_id))
        try:
            tweet = api.get_status(tweet_id, tweet_mode='extended')
            print("Success")
            json.dump(tweet._json, outfile)
            outfile.write('\n')
        except tweepy.TweepError as e:
            print("Fail")
            fails_dict[tweet_id] = e
end = timer()
print(end - start)
print(fails_dict)
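Some failures are permanent (deleted or protected tweets), but transient errors could be retried. A possible retry pass over fails_dict, appending to the same file, might look like this (a sketch, not part of the original run):
# Sketch: retry the IDs that failed, appending any successes to the same file
with open('tweet_json.txt', 'a') as outfile:
    for tweet_id in list(fails_dict):
        try:
            tweet = api.get_status(tweet_id, tweet_mode='extended')
            json.dump(tweet._json, outfile)
            outfile.write('\n')
            del fails_dict[tweet_id]   # only permanent failures remain afterwards
        except tweepy.TweepError:
            pass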
# Creating a list to hold the json data for all tweets
data = []
with open("tweet_json.txt", "r") as f:
for line in f:
data.append(json.loads(line))
# Creating additional lists to hold the followers' and friends' counts
followers_count = []
friends_count = []
for i in range(len(data)):
    followers_count.append(data[i]['user']['followers_count'])
    friends_count.append(data[i]['user']['friends_count'])
# Keeping only the id and the retweet/favorite counts (a list keeps the column order deterministic)
df_json = pd.DataFrame(data, columns=['id_str', 'retweet_count', 'favorite_count'])
# Adding friends' and followers' counts to the dataframe
df_json['followers_count'] = followers_count
df_json['friends_count'] = friends_count
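The same frame could also be built in a single pass over the parsed JSON; the sketch below uses only fields already referenced above and is equivalent to the steps just shown.
# Sketch: one-pass construction of the same columns from the parsed tweets
df_json_alt = pd.DataFrame([{
    'id_str': t['id_str'],
    'retweet_count': t['retweet_count'],
    'favorite_count': t['favorite_count'],
    'followers_count': t['user']['followers_count'],
    'friends_count': t['user']['friends_count'],
} for t in data])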
In this section, I will detect and document quality and tidiness issues using both visual and programmatic assessment.
df_twitter
df_predictions
df_json
Dataset 1: Enhanced Twitter Archive
df_twitter.head()
df_twitter.name.value_counts()
df_twitter.info()
df_twitter.nunique()
sum(df_twitter.duplicated())
df_twitter.dtypes
Dataset 2: Image Predictions
df_predictions.head()
df_predictions.info()
df_predictions.dtypes
sum(df_predictions.jpg_url.duplicated())
Dataset 3: Twitter JSON
df_json.head()
df_json.info()
df_json.describe()
df_json.dtypes
df_json.nunique()
There are retweets, and the columns in_reply_to_status_id, in_reply_to_user_id, retweeted_status_id, retweeted_status_user_id and retweeted_status_timestamp have plenty of empty values
timestamp values have a +0000 attached
Erroneous datatypes in tweet_id and timestamp
Missing column for the fraction rating_numerator / rating_denominator
Incorrect dog names listed as "a", "the" and "an"
Source data has unwanted link tags
Dog names listed as None instead of NaN
The dog breed names are separated by '_' instead of a space
Duplicated URLs in jpg_url
Naming convention of id_str is not consistent with tweet_id
The columns doggo, puppo, pupper and floofer should be combined into a single column
All three tables should be combined into one dataset
A few of these issues are spot-checked programmatically below.
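A minimal sketch of those spot checks, using the dataframes already loaded (the columns referenced are the ones assessed above):
# Spot checks for some of the documented quality issues
print(df_twitter.retweeted_status_id.notnull().sum())      # number of retweets present
print(df_twitter.timestamp.str.endswith('+0000').all())    # every timestamp carries a +0000 offset
print((df_twitter.name == 'None').sum())                    # dog names stored as the string 'None'
print(df_twitter.tweet_id.dtype, df_json.id_str.dtype)      # inconsistent id column names and dtypes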
In this section, I will clean the issues documented during assessment.
# Make copies of original pieces of data
df_twitter_clean = df_twitter.copy()
df_predictions_clean = df_predictions.copy()
df_json_clean = df_json.copy()
df_twitter_clean.sample(4)
# Removing retweets by keeping only rows where retweeted_status_id is NaN
df_twitter_clean = df_twitter_clean[np.isnan(df_twitter_clean.retweeted_status_id)]
# Confirming null entries in retweeted status user id
df_twitter_clean.info()
Drop the multiple columns with plenty of empty values
df_twitter_clean = df_twitter_clean.drop(['in_reply_to_status_id', 'in_reply_to_user_id', 'retweeted_status_id',
                                          'retweeted_status_user_id', 'retweeted_status_timestamp'], axis=1)
# Confirming the dropped columns are no longer part of the dataframe
df_twitter_clean.info()
timestamp values have a +0000 attached
# Slicing off the last 5 characters of timestamp
df_twitter_clean.timestamp = df_twitter_clean.timestamp.str[:-5]
# Confirming the trailing +0000 is removed
df_twitter_clean.sample(2)
Erroneous datatypes in tweet_id and timestamp
df_twitter_clean.tweet_id = df_twitter_clean.tweet_id.astype(str)
df_predictions_clean.tweet_id = df_predictions_clean.tweet_id.astype(str)
df_twitter_clean['timestamp'] = pd.to_datetime(df_twitter_clean['timestamp'])
# Confirming the datatype changes
df_twitter_clean.dtypes
# Creating a new column holding the rating as a fraction: rating_numerator / rating_denominator
df_twitter_clean['fraction'] = df_twitter_clean['rating_numerator'].astype(float) / df_twitter_clean['rating_denominator'].astype(float)
df_twitter_clean['fraction'].sample(3)
Incorrect dog names listed as "a", "the" and "an"
# Reference: https://github.com/tharcyla/wrangle-and-analyze-data/blob/main/wrangle_act.ipynb
# Getting the index of all wrong (lowercase) names
err = df_twitter_clean[df_twitter_clean['name'].str.contains(r'^[a-z]')]['name'].index
# Changing them to NaN
df_twitter_clean.loc[err, 'name'] = np.nan
# Confirming changes
df_twitter_clean['name'].value_counts()
df_twitter_clean.source = df_twitter_clean.source.str.extract(r'>([\w\W\s]*)<', expand=True)
# Confirming the link tags are removed
df_twitter_clean.source.sample(3)
# Getting the index of names stored as the string "None"
err = df_twitter_clean.query('name == "None"').index
# Changing them to NaN
df_twitter_clean.loc[err, 'name'] = np.nan
df_twitter_clean['name'].sample(8)
df_predictions_clean['p1'] = df_predictions_clean['p1'].str.replace('_', ' ').str.title()
df_predictions_clean['p2'] = df_predictions_clean['p2'].str.replace('_', ' ').str.title()
df_predictions_clean['p3'] = df_predictions_clean['p3'].str.replace('_', ' ').str.title()
df_predictions_clean.sample(4)
df_predictions_clean = df_predictions_clean.drop_duplicates(subset=['jpg_url'], keep='last')
sum(df_predictions_clean['jpg_url'].duplicated())
# Renaming id_str to tweet_id for consistency across the three tables
df_json_clean.rename({'id_str': 'tweet_id'}, axis=1, inplace=True)
df_json_clean.columns
# Creating a new 'dog_stage' column
df_twitter_clean['dog_stage'] = df_twitter_clean['text'].str.extract('(floofer|doggo|pupper|puppo)', expand=True)
#Changing its datatype
df_twitter_clean['dog_stage'] = df_twitter_clean['dog_stage'].astype('category')
# Deleting the previous columns
df_twitter_clean = df_twitter_clean.drop(['doggo', 'floofer', 'pupper', 'puppo'], axis=1)
df_twitter_clean['dog_stage'].value_counts()
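One caveat worth checking: str.extract keeps only the first stage mentioned, so tweets that name more than one stage are collapsed to a single value. A quick count of how often that happens (a sketch, not in the original notebook):
# Counting tweets whose text mentions more than one dog stage
multi_stage = df_twitter_clean['text'].str.count('floofer|doggo|pupper|puppo') > 1
print(multi_stage.sum())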
df_json_clean.info()
df_all = pd.merge(df_twitter_clean, df_predictions_clean, how='inner', on='tweet_id')
# Merging with the cleaned Twitter JSON data
df_all = pd.merge(df_all, df_json_clean, how='inner', on='tweet_id')
# Confirming the fully merged dataset
df_all.info()
Saving the gathered, assessed, and cleaned master dataset to a CSV file named "twitter_archive_master.csv".
df_all.to_csv('twitter_archive_master.csv', index=False)
In this section, I will be analyzing and visualizing the wrangled data.
# Importing the new stored master dataset
twitter_archive_master = pd.read_csv('twitter_archive_master.csv')
# Changing back datatypes
twitter_archive_master['timestamp'] = pd.to_datetime(twitter_archive_master['timestamp'])
twitter_archive_master['source'] = twitter_archive_master['source'].astype('category')
twitter_archive_master['tweet_id'] = twitter_archive_master['tweet_id'].astype(str)
1. Most popular dog names
most_popular_name = twitter_archive_master.name.value_counts()[1:10].sort_values(ascending=False).plot(kind='barh')
plt.xlabel('Number of Dogs');
plt.ylabel('Dog Names');
plt.title('Most popular dog names', size=15);
plt.savefig('popular-dog-names');
From the bar chart, it is observed that Charlie and Lucy were the most popular dog names.
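As a quick numeric check of the chart (value_counts excludes the NaN names introduced during cleaning):
# Top ten dog names by frequency
print(twitter_archive_master.name.value_counts().head(10))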
2. The most liked dog stages
twitter_archive_master.groupby('dog_stage').favorite_count.mean().plot(kind='bar', title='Most liked dog stages');
plt.ylabel('Likes');
3. The most retweeted dog stages
twitter_archive_master.groupby('dog_stage').retweet_count.mean().plot(kind='bar', title='Most retweeted dog stages');
plt.ylabel('Retweets');
While puppo was the most liked dog stage, floofer was the most retweeted dog stage.
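The two bar charts can also be read off a single table; the sketch below computes both means side by side.
# Mean likes and retweets per dog stage, sorted by likes
stage_stats = twitter_archive_master.groupby('dog_stage')[['favorite_count', 'retweet_count']].mean()
print(stage_stats.sort_values('favorite_count', ascending=False))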
4. The relationship between favorite_count and retweet_count
twitter_archive_master.plot(x='favorite_count', y='retweet_count', kind='scatter', title='Relationship between favorite_count and retweet_count');
From the above graph, it is observed that favorite count and retweet count are strongly positively correlated.
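A correlation coefficient makes that relationship concrete (a quick check, not part of the original analysis):
# Pearson correlation between likes and retweets
print(twitter_archive_master['favorite_count'].corr(twitter_archive_master['retweet_count']))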
5. The most common source of tweets
most_common_source = twitter_archive_master['source'].value_counts()[0:5]
most_common_source.plot(kind='pie', title='Most common sources', autopct='%.1f%%', figsize=(10,10));
The above pie-chart shows that 'Twitter for iPhone' was the most common source of tweets
Visualizing the distribution of tweets over time
twitter_archive_master['tweet_id'].groupby([twitter_archive_master['timestamp'].dt.year, twitter_archive_master['timestamp'].dt.month]).count().plot(kind='line');
plt.title('Distribution of tweets over time', size=13);
plt.xlabel('Time (Monthly Distribution)');
plt.ylabel('Tweets');
twitter_archive_master['timestamp'].dt.month.value_counts()
This shows that most tweets were posted in December, followed by November and January.