数据分析案例

数据来源：kaggle的TMDB 5000 Movie Dataset数据集

import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt

# 一、准备操作
# 1. 读取文件
data_credits = pd.read_csv('tmdb_5000_credits.csv',encoding='utf-8')
data_movies = pd.read_csv('tmdb_5000_movies.csv',encoding='utf-8')
# 2. 查看表的信息
print(data_credits.info(),data_movies.info())
'''
data_credicts:
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4803 non-null   int64 
 1   title     4803 non-null   object
 2   cast      4803 non-null   object         #演员列表
 3   crew      4803 non-null   object

data_movies:
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object    #风格
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64    #电影时长
 14  spoken_languages      4803 non-null   object 
 15  status                4803 non-null   object 
 16  tagline               3959 non-null   object 
 17  title                 4803 non-null   object 
 18  vote_average          4803 non-null   float64
 19  vote_count            4803 non-null   int64  

'''

# 3. 表的合并
#    以credits中的movie_id和movies中的id为主键进行合并
data = pd.merge(data_credits, data_movies, left_on = ['movie_id','title'], right_on = ['id','title'], how='outer')
print(data.info())

# 二.数据分析
# *1.  电影类型如何随着时间的推移而变化？ genres/release_date*
#  由info()可知release_date数据量只有4802 说明有一个空值
#   1）查找空值所在记录的电影名称  [4553    America Is Still the Place]
print(data.loc[data['release_date'].isnull(),'title'])
#   2）百度该电影上映日期 2014-06-01 将其填入空值
data['release_date'].fillna('2014-06-01',inplace=True)
# print(data.info()) # release_date 有4803行 填入成功
#   3）取出所有的电影类型：
#   在genres中 字典是以字符串的形式存放的：
#  print(type(data['genres'][0]))  <class 'str'>
#  因此我们需要把字符串转变为字典类型 这里使用到一个方法:json.loads
#  参考：https://www.php.cn/python-tutorials-424994.html
data['genres'] = data['genres'].transform(json.loads)
# print(type(data['genres'][0]))   #<class 'list'>
#  再把列表中的每一个字典中name键对应的值取出来
type_list = []
for i in range(data['genres'].size):     #对每一行数据进行遍历
    for dict in data['genres'][i]:       #对每一行的列表中每一个字典进行遍历
        if len(dict) != 0:               #当列表不为空时
            type = dict['name']          #取出字典中name的值
            type_list.append(type)
            message = dict['name']
            data.loc[i,['genres']] = ''.join(message)   #把列表嵌套的字典类型转换为字符串类型
type_set = set(type_list)                #通过集合去重
print(type_set)
print(data['genres'])
for j in range(data['genres'].size):
    for dict2 in data['genres'][j]:
         if len(dict1) != 0:
             message = dict2['name']
             data['genres'][j] = ''.join(message)
# 4)把上映时间的字符串类型转换为标准的日期类型
data['release_year'] = pd.to_datetime(data['release_date']).dt.year   #新增一列上映年份的列
# 5)字符串操作
for j in list(type_set):
    data[j] = data['genres'].str.contains(j).transform(lambda x:1 if x else 0)  #某种电影是否包含该类型 并将布尔值转换为数值
# print(data.shape)
# 6)分组
groupby_year = data.groupby(by='release_year')[list(type_set)].sum()
# 7)可视化
plt.figure()
#修改字体
plt.rcParams['font.sans-serif'] = 'SimHei'
#正常显示符号问题
plt.rcParams['axes.unicode_minus'] = False
x =  groupby_year.index
for type in list(type_set):
    y = groupby_year[type]
    print(type)
    plt.plot(x, y)
    plt.legend(labels=type)
plt.show()
#不同类型的电影总数占比
#不同类型的电影总数
# print(data[list(type_set)].sum().sort_values(ascending=False))
#画饼图
x1 = data[list(type_set)].sum().sort_values(ascending=False)
plt.figure()
plt.pie(x1,labels=list(type_set),autopct='%1.1f%%',pctdistance=0.5)
plt.show()
#画柱状图
plt.figure()
x2 = list(type_set)
y = data[list(type_set)].sum().sort_value()

plt.barh(x2,y)
plt.show()

*#2.电影类型与利润的关系*
#1)计算利润
data['profit'] = data['revenue'] - data['budget']
#2) 每种类型的电影的平均利润
movie_type_profit = []
for movie_type in list(type_set):
    # print(data.loc[data[movie_type]==1,'profit'])
    profit_avg = data.loc[data[movie_type]==1,'profit'].mean()
    movie_type_profit.append(profit_avg)
#构建DataFrame表格
movie = pd.DataFrame({'movie_type':list(type_set),'movie_profit':movie_type_profit})
print(movie)
#可视化
plt.figure()
plt.barh(list(type_set),movie_type_profit)
plt.show()
# 3.Universal和paramount两家公司的对比情况
#在表中找到全称‘Universal Pictures’和‘Paramount Pictures’ 字段名为‘production companies’
#1）构建两列数据，代表两个公司
data['Universal Pictures'] = data['production_companies'].str.contains('Universal Pictures')
data['Paramount Pictures'] = data['production_companies'].str.contains('Paramount Pictures')
# print(data['Universal Pictures'])   返回布尔值
#2)对比盈利情况
U_profit = data.loc[data['Universal Pictures'],'profit'].mean()
P_profit = data.loc[data['Paramount Pictures'],'profit'].mean()
# print(U_profit,P_profit)
# 3) 发行影片的情况
U_sum = data['Universal Pictures'].sum()
P_sum = data['Paramount Pictures'].sum()
# 4)可视化
p1 = plt.figure()
x=[1,2]
labels = ['Universal Pictures' , 'Paramount Pictures']
y1=[U_profit,P_profit]
y2 = [U_sum,P_sum]
p1.add_subplot(1,2,1)
plt.xticks(x,labels)
plt.bar(x,y1)
plt.xticks(x,labels)
plt.legend('两家公司盈利情况')

p1.add_subplot(1,2,2)
plt.bar(x,y2)
plt.xticks(x,labels)
plt.legend('两家公司发行影片情况')
plt.show()