Projects

Abebawu Yigezu

Author: Abebawu Yigezu

Experience: Extensive expertise in Machine Learning, Natural Language Processing (NLP), and Data Science, with a strong focus on AdTech, data analytics, and data engineering. I have led and contributed to numerous projects involving real-time data processing, campaign optimization, and advanced AI-driven solutions in the advertising technology space, delivering impactful results and insights through cutting-edge techniques.

This notebook shows how to work with tweets to find the top referenced users, the most retweeted tweets, etc.

import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import collections
import tweepy as tw
tweets_file = 'data/ethiopia_4e3.json'

# Load the tweets: each line of the file is parsed as one JSON document.
tweets_data = []
# The context manager guarantees the handle is closed, and giving the handle
# its own name keeps `tweets_file` bound to the path instead of shadowing it.
# JSON text is read as UTF-8 explicitly rather than relying on the locale.
with open(tweets_file, "r", encoding="utf-8") as tweets_handle:
    for line in tweets_handle:
        try:
            tweets_data.append(json.loads(line))
        except json.JSONDecodeError:
            # Skip lines that are not valid JSON (e.g. blank lines) instead
            # of aborting the whole load.
            continue
import re
# A '#' that does not directly follow a word character, followed by word
# characters that include at least one ASCII letter.
_HASHTAG_RE = re.compile(r'\B#\w*[a-zA-Z]+\w*')


def get_hashtags(text):
    """Return all hashtags found in ``text``, in order of appearance.

    A hashtag is a ``#`` (at the start of the text or after a non-word
    character) followed by word characters containing at least one ASCII
    letter, so purely numeric tags such as ``#123`` are not returned.

    Args:
        text: The string to scan.

    Returns:
        A list of matched hashtags, each including its leading ``#``.
    """
    return _HASHTAG_RE.findall(text)
# Collect hashtags from every tweet's text and, when the tweet carries a
# 'retweeted_status', from that status' text as well (so a retweet
# contributes hashtags from both texts).
hashtags = []
for tweet in tweets_data:
    try:
        hashtags.extend(get_hashtags(tweet['text']))
        hashtags.extend(get_hashtags(tweet['retweeted_status']['text']))
    except (KeyError, TypeError):
        # KeyError: the record has no 'text' or is not a retweet.
        # TypeError: the record is not a mapping or the text is not a string.
        # Such records are skipped on purpose; any other error now surfaces
        # instead of being silently swallowed by a bare `except`.
        pass

# Count occurrences and keep the five most frequent hashtags.
counts_no_hashtags = collections.Counter(hashtags)
print(counts_no_hashtags.most_common(5))
top_hashtags = pd.DataFrame(counts_no_hashtags.most_common(5),
                            columns=['hashtags', 'count'])

# Horizontal bar chart; rows are sorted ascending by count before plotting.
fig, ax = plt.subplots(figsize=(8, 8))
top_hashtags.sort_values(by='count').plot.barh(x='hashtags',
                                               y='count',
                                               ax=ax,
                                               color="pink")

ax.set_title("Top 5 hashtags", fontsize=15, fontweight='bold')

plt.show()
[('#TrumpIsALiar', 161), ('#Trump', 128), ('#ImpeachTrump', 101), ('#MAGA', 68), ('#TheResistance', 54)]

png

# Gather the 'name' stored under each tweet's 'user' entry; tweets missing
# either key are skipped.
# NOTE(review): the chart is titled "referenced users", but the value counted
# is tweet['user']['name'] - presumably the tweet's author; confirm intent.
all_user_in_tweet = []
for status in tweets_data:
    try:
        user_name = status['user']['name']
    except KeyError:
        continue
    all_user_in_tweet.append(user_name)

# Frequency of each name, reduced to the five most common.
counts_no_users = collections.Counter(all_user_in_tweet)
top_five_users = counts_no_users.most_common(5)
print(top_five_users)

top_referenced_users = pd.DataFrame(top_five_users, columns=['users', 'count'])

# Horizontal bar chart of the five names, sorted ascending by count.
fig, ax = plt.subplots(figsize=(8, 8))
users_by_count = top_referenced_users.sort_values(by='count')
users_by_count.plot.barh(x='users', y='count', ax=ax, color="purple")

ax.set_title("Top 5 referenced users", fontsize=15, fontweight='bold')

plt.show()
[('Deplorable David', 10), ('Karen simpson', 10), ('✨ tufts family ✨', 8), ('Barbara Mc', 8), ('BBrom', 8)]

png

# Record the id of the 'retweeted_status' for every tweet that has one;
# tweets without that key (or without an 'id' inside it) are skipped.
retweeted_tweets = []
for status in tweets_data:
    try:
        original_id = status['retweeted_status']['id']
    except KeyError:
        continue
    retweeted_tweets.append(original_id)

# How many times each id occurs in the data, reduced to the top five.
retweet_counts = collections.Counter(retweeted_tweets)
top_five_retweets = retweet_counts.most_common(5)
print(top_five_retweets)

most_retweeted_tweets = pd.DataFrame(top_five_retweets,
                                     columns=['tweet_id', 'count'])

# Horizontal bar chart of the five ids, sorted ascending by count.
fig, ax = plt.subplots(figsize=(8, 8))
retweets_by_count = most_retweeted_tweets.sort_values(by='count')
retweets_by_count.plot.barh(x='tweet_id', y='count', ax=ax, color="blue")

ax.set_title("Top 5 retweeted tweets", fontsize=15, fontweight='bold')

plt.show()
[(886631382231003136, 39), (886631577098256384, 38), (886563930147098624, 34), (886632534968283138, 28), (886622966250950656, 25)]

png