import warnings
import pandas as pd
warnings.filterwarnings('ignore')
# Load the US YouTube trending dataset from the working directory and
# preview its first rows.
TRENDING_CSV = "US_youtube_trending_data.csv"
data = pd.read_csv(TRENDING_CSV)
data.head()
video_id | title | publishedAt | channelId | channelTitle | categoryId | trending_date | tags | view_count | likes | dislikes | comment_count | thumbnail_link | comments_disabled | ratings_disabled | description | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3C66w5Z0ixs | I ASKED HER TO BE MY GIRLFRIEND... | 2020-08-11T19:20:14Z | UCvtRTOMP2TqYqu51xNrqAzg | Brawadis | 22 | 2020-08-12T00:00:00Z | brawadis|prank|basketball|skits|ghost|funny vi... | 1514614 | 156908 | 5855 | 35313 | https://i.ytimg.com/vi/3C66w5Z0ixs/default.jpg | False | False | SUBSCRIBE to BRAWADIS ▶ http://bit.ly/Subscrib... |
1 | M9Pmf9AB4Mo | Apex Legends | Stories from the Outlands – “Th... | 2020-08-11T17:00:10Z | UC0ZV6M2THA81QT9hrVWJG3A | Apex Legends | 20 | 2020-08-12T00:00:00Z | Apex Legends|Apex Legends characters|new Apex ... | 2381688 | 146739 | 2794 | 16549 | https://i.ytimg.com/vi/M9Pmf9AB4Mo/default.jpg | False | False | While running her own modding shop, Ramya Pare... |
2 | J78aPJ3VyNs | I left youtube for a month and THIS is what ha... | 2020-08-11T16:34:06Z | UCYzPXprvl5Y-Sf0g4vX-m6g | jacksepticeye | 24 | 2020-08-12T00:00:00Z | jacksepticeye|funny|funny meme|memes|jacksepti... | 2038853 | 353787 | 2628 | 40221 | https://i.ytimg.com/vi/J78aPJ3VyNs/default.jpg | False | False | I left youtube for a month and this is what ha... |
3 | kXLn3HkpjaA | XXL 2020 Freshman Class Revealed - Official An... | 2020-08-11T16:38:55Z | UCbg_UMjlHJg_19SZckaKajg | XXL | 10 | 2020-08-12T00:00:00Z | xxl freshman|xxl freshmen|2020 xxl freshman|20... | 496771 | 23251 | 1856 | 7647 | https://i.ytimg.com/vi/kXLn3HkpjaA/default.jpg | False | False | Subscribe to XXL → http://bit.ly/subscribe-xxl... |
4 | VIUo6yapDbc | Ultimate DIY Home Movie Theater for The LaBran... | 2020-08-11T15:10:05Z | UCDVPcEbVLQgLZX0Rt6jo34A | Mr. Kate | 26 | 2020-08-12T00:00:00Z | The LaBrant Family|DIY|Interior Design|Makeove... | 1123889 | 45802 | 964 | 2196 | https://i.ytimg.com/vi/VIUo6yapDbc/default.jpg | False | False | Transforming The LaBrant Family's empty white ... |
# Remove the columns this analysis never reads, then keep one row per
# video_id (the last occurrence in file order).
unused_columns = ["thumbnail_link", "comments_disabled", "ratings_disabled", "description"]
data = data.drop(columns=unused_columns).drop_duplicates(subset="video_id", keep="last")
# Count missing values in each remaining column.
data.isna().sum()
video_id 0 title 0 publishedAt 0 channelId 0 channelTitle 0 categoryId 0 trending_date 0 tags 0 view_count 0 likes 0 dislikes 0 comment_count 0 dtype: int64
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats
import numpy as np
# Report how many rows hold a non-positive value in each engagement column.
for column in ("view_count", "likes", "comment_count", "dislikes"):
    print(int((data[column] <= 0).sum()))
# Discard every row whose likes, comment_count or view_count is non-positive.
# dislikes is only reported above; it is not used as a drop criterion.
non_positive = (data[["likes", "comment_count", "view_count"]] <= 0).any(axis=1)
data = data.drop(index=data.index[non_positive.values])
print(data.columns)
# Summary statistics of the three columns that were just filtered.
data[["view_count", "likes", "comment_count"]].describe()
12 182 437 6475 Index(['video_id', 'title', 'publishedAt', 'channelId', 'channelTitle', 'categoryId', 'trending_date', 'tags', 'view_count', 'likes', 'dislikes', 'comment_count'], dtype='object')
view_count | likes | comment_count | |
---|---|---|---|
count | 2.325800e+04 | 2.325800e+04 | 2.325800e+04 |
mean | 2.821533e+06 | 1.434977e+05 | 1.124083e+04 |
std | 6.934122e+06 | 4.003858e+05 | 8.638233e+04 |
min | 4.347200e+04 | 1.700000e+01 | 4.000000e+00 |
25% | 5.898188e+05 | 2.148550e+04 | 1.513250e+03 |
50% | 1.202252e+06 | 5.157550e+04 | 3.352000e+03 |
75% | 2.610916e+06 | 1.295885e+05 | 7.777500e+03 |
max | 2.644074e+08 | 1.602153e+07 | 6.738537e+06 |
# Explore view_count: density-normalised histogram (50 bins) with a KDE
# curve drawn on top.
# `sns.distplot` is deprecated, so the equivalent `histplot` call is used;
# `stat="density"` keeps the y axis a density as before.
sns.histplot(data["view_count"], bins=50, kde=True, stat="density")
<AxesSubplot:xlabel='view_count', ylabel='Density'>
# Explore ln(view_count): filled kernel density estimate of the natural log
# of the view counts.
# `sns.kdeplot(..., fill=True)` replaces the deprecated
# `sns.distplot(..., hist=False, kde_kws={"shade": True})` spelling.
plt.clf()
ax = sns.kdeplot(np.log(data["view_count"]), fill=True)
ax.set(xlabel='ln(view_count)')
plt.show()
# Explore the like rate (likes / view_count), used here as a popularity
# measure, and check how far it is from a normal distribution.
data["rate_likes"] = data['likes'] / data["view_count"]
from scipy.stats import kstest

# Kolmogorov-Smirnov test against a normal distribution parameterised with
# the sample mean and sample standard deviation.
res = kstest(data["rate_likes"], 'norm', (data["rate_likes"].mean(), data["rate_likes"].std()))
print(res)
plt.clf()
# Second normality test, followed by a Q-Q plot against the normal
# distribution for a visual check.
print(scipy.stats.normaltest(data["rate_likes"]))
scipy.stats.probplot(data["rate_likes"], dist="norm", plot=plt)
plt.title("Q-Q Plot for Video Like Rate")
plt.show()
# Density histogram (40 bins) with a KDE overlay. `histplot` replaces the
# deprecated `sns.distplot(..., hist=True, bins=40)` call.
sns.histplot(data["rate_likes"], bins=40, kde=True, stat="density")
KstestResult(statistic=0.07964721538565644, pvalue=8.813513831500901e-129) NormaltestResult(statistic=6992.485911812571, pvalue=0.0)
<AxesSubplot:xlabel='rate_likes', ylabel='Density'>
# Exploring correlations between the engagement metrics.
plt.clf()
# Only rows with a positive dislike count take part in the correlation.
# The earlier `>= 0` condition matched every row, so it filtered nothing.
# NOTE(review): rows with dislikes == 0 are presumably videos whose dislike
# count is unavailable rather than truly zero - confirm against the dataset.
fc = data.loc[
    data["dislikes"] > 0,
    ["rate_likes", "view_count", "likes", "dislikes", "comment_count"],
].corr()
# Boolean mask hiding the upper triangle (diagonal included), so each pair
# of metrics is shown once.
mask = np.triu(np.ones_like(fc, dtype=bool))
sns.heatmap(
    fc,
    mask=mask,
    linewidths=.5,
    vmin=-1,
    vmax=1,
    annot=True,
    fmt='.2f',
    cmap=sns.color_palette('RdBu_r', n_colors=128),
)
plt.show()
# Scatter plot with a fitted regression line: view_count against likes,
# using all rows of `data`.
sns.lmplot(y="view_count", x='likes', data=data)
plt.show()
# Inspect rate_likes for abnormal values with a horizontal box plot.
# The series is passed as `x=` explicitly: as a bare positional argument,
# newer seaborn releases interpret it as `data=` instead, which changes the
# plot orientation and axis label.
sns.boxplot(x=data["rate_likes"])
# Optional 3-sigma outlier removal, intentionally left disabled:
# error = data[np.abs(data["rate_likes"] - data["rate_likes"].mean()) > 3 * data["rate_likes"].std()]
# data = data.drop(error.index)
<AxesSubplot:xlabel='rate_likes'>
from pyecharts.charts import Line,Pie,Grid,Bar,Page
import pyecharts.options as opts
# Donut chart of the 15 channels that contribute the most rows (videos) to
# the dataset.
play_message = data.groupby(['channelTitle'])
play_com = (
    play_message['channelTitle']
    .agg(['count'])
    .sort_values('count', ascending=False)
    .head(15)
    .reset_index()
)
attr = play_com['channelTitle']
v1 = play_com['count']
# Each slice is a [channel title, video count] pair.
channel_slices = [[channel, videos] for channel, videos in zip(attr, v1)]
pie = Pie(init_opts=opts.InitOpts(width="1100px", height="600px"))
pie.add("", channel_slices, radius=["40%", "70%"])
pie.set_global_opts(
    title_opts=opts.TitleOpts(title="TOP15 Video Count Channel", pos_left="center", pos_top="top"),
    legend_opts=opts.LegendOpts(orient="vertical", pos_left="1%", pos_bottom="20%"),
    toolbox_opts=opts.ToolboxOpts(is_show=True, feature={"saveAsImage": {}}),
)
# Label every slice as "<name>: <percent>%".
pie.set_series_opts(label_opts=opts.LabelOpts(is_show=True, formatter="{b}: {d}%"))
pie.render_notebook()
# Cumulative share of rows (videos) covered by the top-k channels, with
# channels ordered from most to fewest videos.
# NOTE(review): the chart labels below say "view_count", but this series is
# built from per-channel video counts divided by the number of rows -
# confirm which metric is intended.
item_cum = data['channelTitle'].value_counts().sort_values(ascending=False).cumsum() / len(data['view_count'])
# One x label per channel rank (1..n), matching the y values one-to-one.
# The earlier `range(len(item_cum) + 1)` had one more entry than there are
# y values. Labels are strings because the x axis is a category axis, and a
# plain list serialises cleanly into the chart options.
x = [str(rank) for rank in range(1, len(item_cum) + 1)]
line7 = (
    Line()
    .add_xaxis(x)
    .add_yaxis('Percentage of Cumulative view_count', item_cum.values.tolist())
    .set_global_opts(
        title_opts=opts.TitleOpts(title='Trends in the Percentage of Cumulative view_count by Channel', pos_left="50%"),
        legend_opts=opts.LegendOpts(is_show=False),
        yaxis_opts=opts.AxisOpts(name='Percentage of Cumulative view_count'),
        xaxis_opts=opts.AxisOpts(name='Number of Channel'),
        toolbox_opts=opts.ToolboxOpts(is_show=True, feature={"saveAsImage": {}}),
    )
    # Point labels are hidden; there is one point per channel.
    .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
)
line7.render_notebook()
# Spot check: total view_count of a single channel.
data.loc[data["channelTitle"] == "MrBeast", "view_count"].sum()
# Total view_count per channel, largest first, kept as a one-column frame.
data2 = (
    data.groupby('channelTitle')['view_count']
    .sum()
    .sort_values(ascending=False)
    .to_frame()
)
attr = data2.index[:15]
# Top-15 totals expressed in millions of views, rounded to one decimal.
v1 = [float(f"{views / 1000000:.1f}") for views in data2['view_count'][:15]]
bar = Bar(init_opts=opts.InitOpts(width="800px", height="400px"))
# Both sequences are reversed so that, once the axes are swapped below, the
# largest channel ends up at the top of the horizontal bar chart.
bar.add_xaxis(attr.tolist()[::-1])
bar.add_yaxis("", v1[::-1], color='green')
bar.set_global_opts(
    title_opts=opts.TitleOpts(title="", pos_left="center", pos_top="18"),
    toolbox_opts=opts.ToolboxOpts(is_show=True, feature={"saveAsImage": {}}),
    xaxis_opts=opts.AxisOpts(splitline_opts=opts.SplitLineOpts(is_show=True)),
)
bar.set_series_opts(label_opts=opts.LabelOpts(is_show=True, position="right", color="black"))
bar.reversal_axis()
bar.render_notebook()
# Fill category: translate the numeric categoryId values into category
# titles using the id -> title table stored in US_category_id.json.
import json

# The file is opened as UTF-8 explicitly so that parsing does not depend on
# the platform's default text encoding.
with open("US_category_id.json", 'r', encoding="utf-8") as f:
    category_dict = json.load(f)["items"]
# Ids are converted to int so they can match integer categoryId values.
category_rp = {int(item["id"]): item["snippet"]["title"] for item in category_dict}
print(category_rp)
# Values missing from the mapping are left unchanged by `replace`.
data['categoryId'] = data['categoryId'].replace(category_rp)
data.rename(columns={'categoryId': 'category'}, inplace=True)
data.head()
{1: 'Film & Animation', 2: 'Autos & Vehicles', 10: 'Music', 15: 'Pets & Animals', 17: 'Sports', 18: 'Short Movies', 19: 'Travel & Events', 20: 'Gaming', 21: 'Videoblogging', 22: 'People & Blogs', 23: 'Comedy', 24: 'Entertainment', 25: 'News & Politics', 26: 'Howto & Style', 27: 'Education', 28: 'Science & Technology', 29: 'Nonprofits & Activism', 30: 'Movies', 31: 'Anime/Animation', 32: 'Action/Adventure', 33: 'Classics', 34: 'Comedy', 35: 'Documentary', 36: 'Drama', 37: 'Family', 38: 'Foreign', 39: 'Horror', 40: 'Sci-Fi/Fantasy', 41: 'Thriller', 42: 'Shorts', 43: 'Shows', 44: 'Trailers'}
video_id | title | publishedAt | channelId | channelTitle | category | trending_date | tags | view_count | likes | dislikes | comment_count | rate_likes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
172 | cAtazIk1IYw | How To Make a Curried Egg Sandwich | 2020-08-07T18:30:06Z | UCR4s1DE9J4DHzZYXMltSMAg | HowToBasic | Howto & Style | 2020-08-12T00:00:00Z | how to make a curried egg sandwich|curried egg... | 1238677 | 104736 | 3736 | 13876 | 0.084555 |
173 | NYFHnIiA8gE | Cake Rescue Fixing Viral Cake Fails | How To C... | 2020-08-07T09:30:04Z | UCsP7Bpw36J666Fct5M8u-ZA | How To Cook That | Entertainment | 2020-08-12T00:00:00Z | cake rescue|caek fail|viral cake fails|funny c... | 938198 | 44088 | 565 | 2409 | 0.046992 |
174 | czwejgoH3zs | Son, lemme teach you something new | 2020-08-06T19:47:12Z | UCw03U5DZGLqvv5elJvXvR0Q | Bread Boys | Entertainment | 2020-08-12T00:00:00Z | [None] | 1722152 | 169501 | 927 | 7263 | 0.098424 |
175 | dO6YihaqtaQ | Trump takes executive action to address econom... | 2020-08-09T01:35:42Z | UCBi2mrWuNuyYy4gbM6fU18Q | ABC News | News & Politics | 2020-08-12T00:00:00Z | president|trump|donald|executive|orders|stimul... | 1090847 | 10922 | 2517 | 9876 | 0.010012 |
176 | sSjtGqRXQ9Y | JUDAS AND THE BLACK MESSIAH - Official Trailer | 2020-08-06T23:01:42Z | UCjmJDM5pRKbUlVIzDYYWb6g | Warner Bros. Pictures | Entertainment | 2020-08-12T00:00:00Z | warner bros|warner brothers|wb|fred hampton|wi... | 971704 | 23311 | 1987 | 3240 | 0.023990 |
# Explore category: pie chart of the 10 categories with the largest total
# view_count.
play_message = data.groupby(['category'])
play_com = play_message['view_count'].agg(['sum']).sort_values('sum', ascending=False)[:10]
play_com.reset_index(inplace=True)
attr = play_com['category']
v1 = play_com['sum']
pie = Pie(init_opts=opts.InitOpts(width="1100px", height="600px"))
# Each slice is a [category, total view_count] pair.
pie.add("", [list(z) for z in zip(attr, v1)], radius=["0%", "70%"])
# The title used to be the one-letter placeholder "T"; it now states what
# the chart shows, in the same style as the channel pie chart's title.
pie.set_global_opts(
    title_opts=opts.TitleOpts(title="TOP10 View Count Category", pos_left="center", pos_top="top"),
    legend_opts=opts.LegendOpts(orient="vertical", pos_left="1%", pos_bottom="30%"),
    toolbox_opts=opts.ToolboxOpts(is_show=True, feature={"saveAsImage": {}}),
)
# Label every slice as "<name>: <percent>%".
pie.set_series_opts(label_opts=opts.LabelOpts(is_show=True, formatter="{b}: {d}%"))
pie.render_notebook()
# Exploring date: derive a month key from the first seven characters of
# trending_date, then compute per-month aggregates.
data["trending_month"] = data["trending_date"].astype("str").str[:7]
monthly = data.groupby("trending_month")
# Number of rows per month, and the total views per month.
date_count = monthly.size()
date_view_all = monthly["view_count"].sum()
# Per-month totals divided by the number of rows in that month.
date_view = date_view_all / date_count
date_likes = monthly["likes"].sum() / date_count
date_dislikes = monthly["dislikes"].sum() / date_count
date_comment = monthly["comment_count"].sum() / date_count
date_view.index = date_view.index.astype("str")
date_view.index
Index(['2020-08', '2020-09', '2020-10', '2020-11', '2020-12', '2021-01', '2021-02', '2021-03', '2021-04', '2021-05', '2021-06', '2021-07', '2021-08', '2021-09', '2021-10', '2021-11', '2021-12', '2022-01', '2022-02', '2022-03', '2022-04', '2022-05'], dtype='object', name='trending_month')
from pyecharts.charts import Line,Pie,Grid,Bar,Page
import pyecharts.options as opts
def _monthly_trend_line(series, label, **title_position):
    """Build a line chart for one per-month series.

    Args:
        series: pandas Series whose index supplies the x-axis labels and
            whose values supply the y values.
        label: Text used as the series name and the y-axis name; the chart
            title is this text followed by '变化趋势'.
        **title_position: Keyword arguments forwarded to ``opts.TitleOpts``
            (for example ``pos_left`` or ``pos_top``) to place the title.

    Returns:
        The configured ``Line`` chart, with legend and point labels hidden.
    """
    return (
        Line()
        .add_xaxis(series.index.tolist())
        .add_yaxis(label, series.values.tolist())
        .set_global_opts(
            title_opts=opts.TitleOpts(title=label + '变化趋势', **title_position),
            legend_opts=opts.LegendOpts(is_show=False),
            yaxis_opts=opts.AxisOpts(name=label),
            toolbox_opts=opts.ToolboxOpts(is_show=True, feature={"saveAsImage": {}}),
        )
        .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    )


# The four charts differ only in their data, label and title position, so
# they share one builder. Labels stay in Chinese because they are rendered
# in the charts; their meanings are noted next to each call.
# Average views per video.
line1 = _monthly_trend_line(date_view, '视频平均播放量', pos_left="20%")
# Average comments per video.
line2 = _monthly_trend_line(date_comment, '视频平均评论数量', pos_right="20%")
# Average likes per video.
line3 = _monthly_trend_line(date_likes, '视频平均点赞数', pos_top="50%", pos_left="20%")
# Number of trending videos.
line4 = _monthly_trend_line(date_count, '趋势视频数量', pos_top="50%", pos_right="20%")
# 2x2 layout: line1 top-left, line2 top-right, line3 bottom-left,
# line4 bottom-right.
grid1 = (
    Grid()
    .add(line1, grid_opts=opts.GridOpts(pos_bottom="60%", pos_right="55%"))
    .add(line2, grid_opts=opts.GridOpts(pos_bottom="60%", pos_left="55%"))
    .add(line3, grid_opts=opts.GridOpts(pos_top="60%", pos_right="55%"))
    .add(line4, grid_opts=opts.GridOpts(pos_top="60%", pos_left="55%"))
)
grid1.render_notebook()