数据分析案例
数据来源:kaggle的TMDB 5000 Movie Dataset数据集
import numpy as np import pandas as pd import json import matplotlib.pyplot as plt # 一、准备操作 # 1. 读取文件 data_credits = pd.read_csv('tmdb_5000_credits.csv',encoding='utf-8') data_movies = pd.read_csv('tmdb_5000_movies.csv',encoding='utf-8') # 2. 查看表的信息 print(data_credits.info(),data_movies.info()) ''' data_credicts: # Column Non-Null Count Dtype --- ------ -------------- ----- 0 movie_id 4803 non-null int64 1 title 4803 non-null object 2 cast 4803 non-null object #演员列表 3 crew 4803 non-null object data_movies: # Column Non-Null Count Dtype --- ------ -------------- ----- 0 budget 4803 non-null int64 1 genres 4803 non-null object #风格 2 homepage 1712 non-null object 3 id 4803 non-null int64 4 keywords 4803 non-null object 5 original_language 4803 non-null object 6 original_title 4803 non-null object 7 overview 4800 non-null object 8 popularity 4803 non-null float64 9 production_companies 4803 non-null object 10 production_countries 4803 non-null object 11 release_date 4802 non-null object 12 revenue 4803 non-null int64 13 runtime 4801 non-null float64 #电影时长 14 spoken_languages 4803 non-null object 15 status 4803 non-null object 16 tagline 3959 non-null object 17 title 4803 non-null object 18 vote_average 4803 non-null float64 19 vote_count 4803 non-null int64 ''' # 3. 表的合并 # 以credits中的movie_id和movies中的id为主键进行合并 data = pd.merge(data_credits, data_movies, left_on = ['movie_id','title'], right_on = ['id','title'], how='outer') print(data.info()) # 二.数据分析 # *1. 电影类型如何随着时间的推移而变化? genres/release_date* # 由info()可知release_date数据量只有4802 说明有一个空值 # 1)查找空值所在记录的电影名称 [4553 America Is Still the Place] print(data.loc[data['release_date'].isnull(),'title']) # 2)百度该电影上映日期 2014-06-01 将其填入空值 data['release_date'].fillna('2014-06-01',inplace=True) # print(data.info()) # release_date 有4803行 填入成功 # 3)取出所有的电影类型: # 在genres中 字典是以字符串的形式存放的: # print(type(data['genres'][0])) <class 'str'> # 因此我们需要把字符串转变为字典类型 这里使用到一个方法:json.loads # 参考:https://www.php.cn/python-tutorials-424994.html data['genres'] = data['genres'].transform(json.loads) # print(type(data['genres'][0])) #<class 'list'> # 再把列表中的每一个字典中name键对应的值取出来 type_list = [] for i in range(data['genres'].size): #对每一行数据进行遍历 for dict in data['genres'][i]: #对每一行的列表中每一个字典进行遍历 if len(dict) != 0: #当列表不为空时 type = dict['name'] #取出字典中name的值 type_list.append(type) message = dict['name'] data.loc[i,['genres']] = ''.join(message) #把列表嵌套的字典类型转换为字符串类型 type_set = set(type_list) #通过集合去重 print(type_set) print(data['genres']) for j in range(data['genres'].size): for dict2 in data['genres'][j]: if len(dict1) != 0: message = dict2['name'] data['genres'][j] = ''.join(message) # 4)把上映时间的字符串类型转换为标准的日期类型 data['release_year'] = pd.to_datetime(data['release_date']).dt.year #新增一列上映年份的列 # 5)字符串操作 for j in list(type_set): data[j] = data['genres'].str.contains(j).transform(lambda x:1 if x else 0) #某种电影是否包含该类型 并将布尔值转换为数值 # print(data.shape) # 6)分组 groupby_year = data.groupby(by='release_year')[list(type_set)].sum() # 7)可视化 plt.figure() #修改字体 plt.rcParams['font.sans-serif'] = 'SimHei' #正常显示符号问题 plt.rcParams['axes.unicode_minus'] = False x = groupby_year.index for type in list(type_set): y = groupby_year[type] print(type) plt.plot(x, y) plt.legend(labels=type) plt.show() #不同类型的电影总数占比 #不同类型的电影总数 # print(data[list(type_set)].sum().sort_values(ascending=False)) #画饼图 x1 = data[list(type_set)].sum().sort_values(ascending=False) plt.figure() plt.pie(x1,labels=list(type_set),autopct='%1.1f%%',pctdistance=0.5) plt.show() #画柱状图 plt.figure() x2 = list(type_set) y = data[list(type_set)].sum().sort_value() plt.barh(x2,y) plt.show() *#2.电影类型与利润的关系* #1)计算利润 data['profit'] = data['revenue'] - data['budget'] #2) 每种类型的电影的平均利润 movie_type_profit = [] for movie_type in list(type_set): # print(data.loc[data[movie_type]==1,'profit']) profit_avg = data.loc[data[movie_type]==1,'profit'].mean() movie_type_profit.append(profit_avg) #构建DataFrame表格 movie = pd.DataFrame({'movie_type':list(type_set),'movie_profit':movie_type_profit}) print(movie) #可视化 plt.figure() plt.barh(list(type_set),movie_type_profit) plt.show() # 3.Universal和paramount两家公司的对比情况 #在表中找到全称‘Universal Pictures’和‘Paramount Pictures’ 字段名为‘production companies’ #1)构建两列数据,代表两个公司 data['Universal Pictures'] = data['production_companies'].str.contains('Universal Pictures') data['Paramount Pictures'] = data['production_companies'].str.contains('Paramount Pictures') # print(data['Universal Pictures']) 返回布尔值 #2)对比盈利情况 U_profit = data.loc[data['Universal Pictures'],'profit'].mean() P_profit = data.loc[data['Paramount Pictures'],'profit'].mean() # print(U_profit,P_profit) # 3) 发行影片的情况 U_sum = data['Universal Pictures'].sum() P_sum = data['Paramount Pictures'].sum() # 4)可视化 p1 = plt.figure() x=[1,2] labels = ['Universal Pictures' , 'Paramount Pictures'] y1=[U_profit,P_profit] y2 = [U_sum,P_sum] p1.add_subplot(1,2,1) plt.xticks(x,labels) plt.bar(x,y1) plt.xticks(x,labels) plt.legend('两家公司盈利情况') p1.add_subplot(1,2,2) plt.bar(x,y2) plt.xticks(x,labels) plt.legend('两家公司发行影片情况') plt.show()