Visualization blocks
大约 7 分钟
Visualization blocks
Histogram Plus
# Explore the distribution of view_count on its raw scale.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
# kept so the snippet still runs on the seaborn version these notes target.
sns.distplot(data["view_count"], bins=50)
# Display the raw-scale histogram before the figure is cleared below;
# without this call it is wiped without ever being rendered.
plt.show()

plt.clf()
# ln(0) is -inf, which breaks the kernel density estimate, so only strictly
# positive counts are log-transformed.
positive_views = data.loc[data["view_count"] > 0, "view_count"]
ax = sns.distplot(np.log(positive_views), hist=False, kde_kws={"shade": True})
ax.set(xlabel='ln(view_count)')
plt.show()

Q-Q Plot
# Explore the like rate (likes per view), used here as a popularity measure.
data["rate_likes"] = data['likes'] / data["view_count"]

# `import scipy.stats` binds the `scipy` name that the calls below rely on;
# `from scipy.stats import kstest` alone only binds `kstest`.
import scipy.stats
from scipy.stats import kstest

# Kolmogorov-Smirnov test against a normal distribution parameterised by the
# sample's own mean and standard deviation (computed once and reused).
rate_mean = data["rate_likes"].mean()
rate_std = data["rate_likes"].std()
res = kstest(data["rate_likes"], 'norm', (rate_mean, rate_std))
print(res)

plt.clf()
# Second normality check, followed by a Q-Q plot against the normal
# distribution drawn onto the current matplotlib figure.
print(scipy.stats.normaltest(data["rate_likes"]))
scipy.stats.probplot(data["rate_likes"], dist="norm", plot=plt)
plt.title("Q-Q Plot for Video Like Rate")
plt.show()
<<<
KstestResult(statistic=0.07964721538565644, pvalue=8.813513831500901e-129)
NormaltestResult(statistic=6992.485911812571, pvalue=0.0)

Heat Map
# Correlation heat map of the numeric columns, restricted to rows whose
# dislikes value is non-negative.
plt.clf()
corr_columns = ["rate_likes", "view_count", "likes", "dislikes", "comment_count"]
fc = data.loc[data["dislikes"] >= 0, corr_columns].corr()

# Mask the upper triangle (diagonal included) so each pair is shown once.
mask = np.triu(np.ones_like(fc))

sns.heatmap(
    fc,
    mask=mask,
    linewidths=.5,
    vmin=-1,
    vmax=1,
    annot=True,
    fmt='.2f',
    cmap=sns.color_palette('RdBu_r', n_colors=128),
)
plt.show()

Boxplot
# Inspect rate_likes with a boxplot, then drop the rows whose rate_likes lies
# more than three standard deviations away from the mean.
# Clear and show the figure explicitly, as the other matplotlib snippets do,
# so the boxplot is rendered on its own figure when run as a script.
plt.clf()
sns.boxplot(data["rate_likes"])
plt.show()

# Compute the statistics once, on the data as it is before any row is removed.
rate_mean = data["rate_likes"].mean()
rate_std = data["rate_likes"].std()
error = data[np.abs(data["rate_likes"] - rate_mean) > 3 * rate_std]
data = data.drop(error.index)

Pie chart
from pyecharts.charts import Line,Pie,Grid,Bar,Page
import pyecharts.options as opts
# Donut chart of the 15 channels with the most rows in `data`.
play_message = data.groupby(['channelTitle'])
play_com = (
    play_message['channelTitle']
    .agg(['count'])
    .sort_values('count', ascending=False)
    .head(15)
    .reset_index()
)
attr = play_com['channelTitle']
v1 = play_com['count']

pie = Pie(init_opts=opts.InitOpts(width="1100px", height="600px"))
# Each data item is a [channel name, row count] pair.
pie.add(
    "",
    [[channel, count] for channel, count in zip(attr, v1)],
    radius=["40%", "70%"],
)
pie.set_global_opts(
    title_opts=opts.TitleOpts(
        title="TOP15 Video Count Channel", pos_left="center", pos_top="top"
    ),
    legend_opts=opts.LegendOpts(orient="vertical", pos_left="1%", pos_bottom="20%"),
    toolbox_opts=opts.ToolboxOpts(is_show=True, feature={"saveAsImage": {}}),
)
# Label every slice with its name and its percentage of the whole.
pie.set_series_opts(label_opts=opts.LabelOpts(is_show=True, formatter="{b}: {d}%"))
pie.render_notebook()

Cumulative Plot
# Cumulative share of rows in `data` contributed by channels, ordered from the
# channel with the most rows to the one with the fewest.
# NOTE(review): the chart labels below say "view_count", but the values are
# built from per-channel row counts divided by the number of rows — confirm
# which of the two was intended.
item_cum = (
    data['channelTitle'].value_counts().sort_values(ascending=False).cumsum()
    / len(data['view_count'])
)

# One x label per cumulative value: the number of channels included so far
# (1..N). The previous range(len(item_cum) + 1) had one element more than the
# y data and started at 0. Labels are passed as a list of strings because the
# x axis is used as a category axis.
x = [str(channel_rank) for channel_rank in range(1, len(item_cum) + 1)]

line7 = (
    Line()
    .add_xaxis(x)
    .add_yaxis('Percentage of Cumulative view_count', item_cum.values.tolist())
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title='Trends in the Percentage of Cumulative view_count by Channel',
            pos_left="50%",
        ),
        legend_opts=opts.LegendOpts(is_show=False),
        yaxis_opts=opts.AxisOpts(name='Percentage of Cumulative view_count'),
        xaxis_opts=opts.AxisOpts(name='Number of Channel'),
        toolbox_opts=opts.ToolboxOpts(is_show=True, feature={"saveAsImage": {}}),
    )
    .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
)
line7.render_notebook()

Pyecharts Hist
# Spot check: total view_count of a single channel (the value is only
# displayed when this expression is the last statement of a notebook cell).
data.loc[data["channelTitle"] == "MrBeast", "view_count"].sum()

# Total view_count per channel, largest first.
data2 = data.groupby('channelTitle')['view_count'].sum()
data2 = data2.sort_values(ascending=False).to_frame()

# Top 15 channels; totals are expressed in millions with one decimal place.
attr = data2.index[0:15]
v1 = [
    float(format(float(total) / 1000000, '.1f'))
    for total in data2['view_count'][0:15]
]

bar = Bar(init_opts=opts.InitOpts(width="800px", height="400px"))
# Both lists are reversed so that, once the axes are swapped, the largest
# channel ends up at the top of the horizontal bar chart.
bar.add_xaxis(attr.tolist()[::-1])
bar.add_yaxis("", v1[::-1], color='green')
bar.set_global_opts(
    title_opts=opts.TitleOpts(title="", pos_left="center", pos_top="18"),
    toolbox_opts=opts.ToolboxOpts(is_show=True, feature={"saveAsImage": {}}),
    xaxis_opts=opts.AxisOpts(splitline_opts=opts.SplitLineOpts(is_show=True)),
)
bar.set_series_opts(
    label_opts=opts.LabelOpts(is_show=True, position="right", color="black")
)
bar.reversal_axis()
bar.render_notebook()

Pyecharts Line
from pyecharts.charts import Line,Pie,Grid,Bar,Page
import pyecharts.options as opts
def _trend_line(series, label, title, **title_position):
    """Build one line chart for `series`.

    The series index supplies the x axis and its values the y axis. `label`
    is used both as the series name and as the y-axis name; `title` and the
    `title_position` keywords (pos_left / pos_right / pos_top) are forwarded
    to the chart title. Legend and point labels are hidden, and a toolbox
    with a save-as-image button is shown.
    """
    return (
        Line()
        .add_xaxis(series.index.tolist())
        .add_yaxis(label, series.values.tolist())
        .set_global_opts(
            title_opts=opts.TitleOpts(title=title, **title_position),
            legend_opts=opts.LegendOpts(is_show=False),
            yaxis_opts=opts.AxisOpts(name=label),
            toolbox_opts=opts.ToolboxOpts(is_show=True, feature={"saveAsImage": {}}),
        )
        .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    )


# Four trend charts; each title is positioned over the grid cell that holds
# its chart (top-left, top-right, bottom-left, bottom-right).
line1 = _trend_line(date_view, '视频平均播放量', '视频平均播放量变化趋势', pos_left="20%")
line2 = _trend_line(date_comment, '视频平均评论数量', '视频平均评论数量变化趋势', pos_right="20%")
line3 = _trend_line(
    date_likes, '视频平均点赞数', '视频平均点赞数变化趋势', pos_top="50%", pos_left="20%"
)
line4 = _trend_line(
    date_count, '趋势视频数量', '趋势视频数量变化趋势', pos_top="50%", pos_right="20%"
)

# Arrange the four charts in a 2x2 grid.
grid1 = Grid()
for chart, placement in (
    (line1, {"pos_bottom": "60%", "pos_right": "55%"}),
    (line2, {"pos_bottom": "60%", "pos_left": "55%"}),
    (line3, {"pos_top": "60%", "pos_right": "55%"}),
    (line4, {"pos_top": "60%", "pos_left": "55%"}),
):
    grid1.add(chart, grid_opts=opts.GridOpts(**placement))
grid1.render_notebook()
