Python3 Data Science Primer

Environment Setup

Data Science Workflow

  • Inquire
  • Obtain
  • Scrub
  • Explore
  • Model
  • Interpret

Anaconda and Jupyter Notebook

conda: an extensible package management tool

  • Installation
  • Updating (commands sketched below)
  • Creating sandboxed environments
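As a hedged sketch (assuming Anaconda or Miniconda is already installed), checking and updating conda itself looks like this:

conda --version # show the installed conda version
conda update conda # update conda itself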

Environment (sandbox) operations

conda create --name python37 python=3.7 # create a new environment
activate python37 # activate an environment (newer conda versions: conda activate python37)
deactivate # exit the current environment (newer conda versions: conda deactivate)
conda remove --name python37 --all # delete an environment

Conda package management

conda install numpy # install a package into the active environment
conda list # list installed packages
conda list -n python37 # list packages installed in a specific environment
conda remove -n python37 numpy # remove a package from a specific environment

Using Jupyter
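A minimal sketch of installing and launching Jupyter inside the active conda environment:

conda install jupyter # install Jupyter into the active environment
jupyter notebook # start the notebook server and open it in the browser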

Numpy

The 5 most commonly used libraries in data science (canonical import aliases are sketched after this list)

  • Numpy

    • N-dimensional arrays (matrices) with efficient indexing, no explicit loops needed
    • Open source and free; performance comparable to C/Matlab
  • Scipy

    • Depends on Numpy
    • Designed for science and engineering
    • Implements many common scientific computations: linear algebra, Fourier transforms, signal and image processing
  • Pandas

    • A powerful tool for structured data analysis (built on Numpy)
    • Provides several high-level data structures: Time-Series, DataFrame, Panel
    • Powerful data indexing and manipulation capabilities
  • Matplotlib

    • The most widely used 2D plotting package for Python
    • Can largely replace Matlab's plotting functionality
    • Beautiful 3D plots can be drawn via mplot3d
  • Scikit-learn

    • A Python module for machine learning
    • Built on top of Scipy
    • Simple, easy-to-learn API
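These five libraries are conventionally imported under short aliases; a minimal sketch (the aliases are convention, not a requirement):

import numpy as np # N-dimensional arrays
import scipy # scientific computing built on Numpy
import pandas as pd # structured data analysis
import matplotlib.pyplot as plt # 2D plotting
import sklearn # machine learning (scikit-learn)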

Matrix computations

The math here is the same as in linear algebra.

Creating arrays

import numpy as np
list_1 = [1, 2, 3, 4]
array_1 = np.array(list_1)
list_2 = [5, 6, 7, 8]
array_2 = np.array([list_1, list_2]) # 2-D array
array_2.shape # shape (m, n)
array_2.size # total number of elements
array_2.dtype # dtype of the elements; mixed types are upcast to the most precise one
array_4 = np.arange(1, 10) # half-open interval [1, 10)
array_5 = np.arange(1, 10, 2) # step of 2
np.zeros(5) # 1x5 array of zeros
np.zeros((2, 3)) # 2x3 array of zeros (the shape must be passed as a tuple)
array_4[0] # first element
array_4[1:5] # elements 1 through 4 (slice)
array_2[1][0] # row 1, column 0
array_2[:2, 1:] # first two rows, columns 1 onward

Array operations

import numpy as np
np.random.randn(10) # 10 samples from the standard normal distribution
np.random.randint(10, size=(2, 3)) # random integers in [0, 10), shape 2x3
# reshape changes the shape of an array
np.zeros(20).reshape(4, 5)
# element-wise add, subtract, multiply, divide (a and b must have compatible shapes)
a = np.random.randint(10, size=(2, 3))
b = np.random.randint(10, size=(2, 3))
a + b
a - b
a * b
a / b

Matrix operations

np.mat([[1, 2, 3], [4, 5, 6]])
# convert an array to a matrix
np.mat(array_1)
A = np.mat(a)
B = np.mat(b)
A + B
A * B # matrix product: the number of columns of A must equal the number of rows of B (fails for the 2x3 a and b above; see the sketch below)
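To make the shape constraint concrete, a small hedged sketch (the variables here are illustrative, not the a and b defined earlier) contrasting element-wise array multiplication with a true matrix product:

import numpy as np
a2 = np.array([[1, 2, 3], [4, 5, 6]]) # 2x3
b2 = np.array([[1, 2], [3, 4], [5, 6]]) # 3x2
np.mat(a2) * np.mat(b2) # 2x2 matrix product: columns of a2 (3) match rows of b2 (3)
a2 * a2 # element-wise product of two arrays with the same shape
np.dot(a2, b2) # matrix product directly on arrays, same values as the np.mat result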

Common array functions

np.unique(a) # sorted unique elements
sum(a) # Python's built-in sum adds the rows element-wise, giving column sums
sum(a[:, 0]) # sum of the first column
a.max() # maximum over the whole array
max(a[0]) # maximum of the first row
max(a[:, 0]) # maximum of the first column

Array input and output

Traditional Python serialization (pickle)

import pickle
import numpy as np
x = np.arange(10)
f = open('x.pkl', 'wb')
pickle.dump(x,f)
f.close()
f = open('x.pkl', 'rb')
pickle.load(f)
f.close()

Serialization provided by numpy

np.save('one_array', x) # writes one_array.npy
np.load('one_array.npy')

Saving multiple arrays to a single file

y = np.arange(20)
np.savez('two_array.npz', a=x, b=y) # save several arrays into one .npz file
c = np.load('two_array.npz')
c['a']
c['b']

Pandas

Series

Creating a Series

import numpy as np
import pandas as pd
# from a list
s1 = pd.Series([1, 2, 3, 4])
s1.values
s1.index
# from a numpy array
s2 = pd.Series(np.arange(10))
# from a dict
s3 = pd.Series({"1": 1, "2": 2, "3": 3})
# with an explicit index
s4 = pd.Series([1, 2, 3, 4], index=["A", "B", "C", "D"])
s4.values
s4.index

Operations

s4["A"]
s4[s4 > 2]
# convert to a Python dict
s4.to_dict()

s5 = pd.Series(s4.to_dict())
index_1 = ["A", "B", "C", "D", "E"]
s6 = pd.Series(s5, index=index_1) # E has no value, so it becomes NaN
pd.isnull(s6) # True for E
pd.notnull(s6) # False for E
s6.name = "demo"
s6.index.name = "demo index"

DataFrame

import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import webbrowser
link = 'https://www.tiobe.com/tiobe-index/'
webbrowser.open(link)
# create a DataFrame from the clipboard
df = pd.read_clipboard()
type(df)
df.columns
df_new = DataFrame(df, columns=['Sep 2016', 'Programming Language'])
df['Sep 2016']
type(df['Sep 2016']) # Series
df_new['Sep 2018'] = np.arange(10)
df_new['Sep 2018'] = pd.Series(np.arange(10))
df_new['Sep 2018'] = pd.Series([100, 200], index=[1, 2]) # only rows 1 and 2 get values, the rest become NaN

Understanding Series and DataFrame in depth

Series

data = {
    'Country': ['Belgium', 'India', 'Brazil'],
    'Capital': ['Brussels', 'New Delhi', 'Brasilia'],
    'Population': [11190846, 1303171035, 207847528]
}
s1 = pd.Series(data['Country'], index = ['A', 'B', 'C'])
s1.values
s1.index

DataFrame

df1 = pd.DataFrame(data)
df1['Country']
type(df1['Country']) # Series
# iterate over the rows
for row in df1.iterrows():
    print(row)
    print(type(row)) # tuple
    print(row[0]) # index label
    print(row[1]) # the row as a Series
# build a DataFrame from Series
s1 = pd.Series(data['Capital'])
s2 = pd.Series(data['Country'])
s3 = pd.Series(data['Population'])
df_new = pd.DataFrame([s1, s2, s3], index=[...]) # row labels, e.g. ['Capital', 'Country', 'Population']
df_new = df_new.T # transpose so the Series become columns

DataFrame IO operations

pd.read_clipboard()
df.to_clipboard()
pd.read_csv('df.csv')
df.to_csv('df.csv')
pd.read_json(jsonVar)
df.to_json()
pd.read_html('df.html')
df.to_html('df.html')
pd.read_excel('df.xlsx')
df.to_excel('df.xlsx')

DataFrame selecting and indexing

imdb = pd.read_csv('xxx.csv') # e.g. the IMDB movie dataset
imdb.head() # first 5 rows
imdb.head(10) # first 10 rows
imdb.tail() # last 5 rows
imdb.tail(10) # last 10 rows
sub_df = imdb[['director_name', 'movie_title', 'imdb_score']]
sub_df.iloc[10:20, :] # rows 10-19 (half-open), all columns
sub_df.iloc[10:20, 0:2] # rows 10-19, columns 0-1; purely position-based, labels are ignored
imdb.loc[15:17, :'movie_title'] # 15 and 17 are labels, not positions, and both endpoints are included
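A small hedged sketch of the iloc/loc difference on a toy DataFrame (illustrative data, not the imdb file): iloc slices by position and excludes the end, loc slices by label and includes both ends:

import numpy as np
import pandas as pd
demo = pd.DataFrame(np.arange(16).reshape(4, 4), index=[10, 11, 12, 13], columns=['a', 'b', 'c', 'd'])
demo.iloc[0:2, :] # positions 0 and 1 only (end excluded)
demo.loc[10:12, 'a':'c'] # labels 10, 11 and 12, columns a through c (both ends included)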

Reindex

Series reindex

Shift + Tab shows the help tooltip in Jupyter

s1 = Series([1, 2, 3, 4], index=['A', 'B', 'C', 'D'])
s1.reindex(index=['A', 'B', 'C', 'D', 'E']) # the value for the new label E is NaN
s1.reindex(index=['A', 'B', 'C', 'D', 'E'], fill_value=10) # new labels are filled with 10
s1.reindex(index=['A', 'B']) # C, D and E are dropped
s1.drop('A') # drop A
s2 = Series(['A', 'B', 'C'], index=[1, 5, 10])
s2.reindex(index=range(15)) # 1:A 5:B 10:C, all other indices are NaN
s2.reindex(index=range(15), method='ffill') # forward fill: 0:NaN, 1-4:A, 5-9:B, 10-14:C

DataFrame Reindex

df1 = DataFrame(np.random.rand(25).reshape(5, 5), index=['A', 'B', 'D', 'E', 'F'], columns=['c1', 'c2', 'c3', 'c4', 'c5'])
df1.reindex(index=['A', 'B', 'C', 'D', 'E', 'F'], columns=['c1', 'c2', 'c3', 'c4', 'c5', 'c6']) # row C and column c6 are all NaN
df1.drop('A', axis=0) # drop row A; axis=0 refers to the index, axis=1 to the columns

NaN

NaN: Not a Number

n = np.nan # create a NaN value
type(n) # float
m = 1
m + n # nan
s1 = Series([1, 2, np.nan, 3, 4], index=['A', 'B', 'C', 'D', 'E'])
s1.isnull() # returns a new Series: True for C, False elsewhere
s1.notnull() # the opposite of the above
s1.dropna() # drop entries whose value is NaN
df.isnull() # True where a value is NaN, False otherwise
df.notnull() # the opposite of the above
df.dropna(axis=0) # drop rows containing NaN; axis=1 drops columns
df.dropna(axis=0, how="any") # drop a row if it contains any NaN
df.dropna(axis=0, how="all") # drop a row only if all of its values are NaN
df.dropna(axis=0, thresh=2) # keep only rows with at least 2 non-NaN values
df.fillna(value=1) # replace every NaN in df with 1
df.fillna(value={0: 0, 1: 1, 2: 2, 3: 3}) # fill NaN in column 0 with 0, column 1 with 1, ...

Multi-level (hierarchical) index

Multi-level index Series

s1 = Series(np.random.randn(6), index=[['1', '1', '1', '2', '2', '2'], ['a', 'b', 'c', 'a', 'b', 'c']])
s1['1'] # a Series with inner index a, b, c
s1['2'] # a Series with inner index a, b, c
s1['1']['a']
s1['2']['a']
s1[:, 'a'] # any outer index, inner index 'a'; returns a Series

Converting a multi-level index Series to a DataFrame

df1 = s1.unstack()
df2 = DataFrame([s1['1'], s1['2']])

Converting a DataFrame to a multi-level index Series

s2 = df1.unstack()
s2 = df1.T.unstack()

Multi-level index DataFrame

df = DataFrame(np.arange(16).reshape(4, 4), index = [['a', 'a', 'b', 'b'], [1, 2, 1, 2]], columns=[['BJ', 'BJ', 'SH', 'GZ'], [8, 9, 8, 8]])
df['BJ'] # returns a DataFrame
df['BJ'][8]

Mapping and Replace

df1 = DataFrame({"城市": ["BJ", "SH", "GZ"], "人口": [1000, 2000, 1500]}) # columns: 城市 = city, 人口 = population
df1['GDP'] = Series([1000, 2000, 1500])
df2 = DataFrame({"城市": ["BJ", "SH", "GZ"], "人口": [1000, 2000, 1500]})
gdp_map = {"BJ": 1000, "SH": 2000, "GZ": 1500}
df2['GDP'] = df2['城市'].map(gdp_map) # map city names to GDP values
s1 = Series(np.arange(10))
s1.replace(1, np.nan) # returns a new Series
s1.replace({1: np.nan})
s1.replace([1, 2, 3], [10, 20, 30])

Working with data in Pandas

Simple arithmetic on Series and DataFrame

s1 = Series([1, 2, 3], index=['A', 'B', 'C'])
s2 = Series([4, 5, 6, 7], index=['B', 'C', 'D', 'E'])
s1 + s2 # A, D and E become NaN (those labels appear in only one Series)
df1 = DataFrame(np.arange(4).reshape(2, 2), index=['A', 'B'], columns=['BJ', 'SH'])
df2 = DataFrame(np.arange(9).reshape(3, 3), index=['A', 'B', 'C'], columns=['BJ', 'SH', 'GZ'])
df3 = df1 + df2
df3.sum()
df3.min()
df3.max()
df3.describe() # summary statistics: count, mean, std, quartiles, ...

Sorting Series and DataFrames

s1 = Series(np.random.randn(10))
s1.values
s1.index
s2 = s1.sort_values()
s2 = s1.sort_values(ascending=False) # descending order
s2 = s1.sort_index()
df1 = DataFrame(np.random.randn(40).reshape(8, 5), columns=['A', 'B', 'C', 'D', 'E'])
df1['A'].sort_values() # returns a Series
df2 = df1.sort_values('A')
df2.sort_index()

Renaming a DataFrame's index

df1 = DataFrame(np.arange(9).reshape(3, 3), index=['BJ', 'SH', 'GZ'], columns=['A', 'B', 'C'])
df1.index = Series(['bj', 'sh', 'gz'])
df1.index = df1.index.map(str.upper)
df1 = df1.rename(index=str.lower, columns=str.lower)
df1 = df1.rename(index={'bj': 'beijing'}, columns={'a': 'A'})
list1 = [1, 2, 3, 4]
list2 = [str(x) for x in list1] # ['1', '2', '3', '4']
list3 = list(map(str, list1)) # same result using map
def test_map(x):
    return x + '_ABC'
df1.index.map(test_map) # apply a custom function to every index label

DataFrame merge operations

df1 = DataFrame({'key': ['A', 'B', 'C'], 'data_set1': [1, 2, 3]})
df2 = DataFrame({'key': ['A', 'B', 'C'], 'data_set2': [4, 5, 6]})
pd.merge(df1, df2) # merges on the common column 'key' by default
pd.merge(df1, df2, on="key")
pd.merge(df1, df2, how="inner") # or "outer"
pd.merge(df1, df2, how="left") # or "right"
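The two frames above share every key, so inner and outer joins give the same rows; a hedged sketch with a mismatched key (illustrative data, reusing df1 from above) shows how the how= options differ:

df3 = DataFrame({'key': ['A', 'B', 'D'], 'data_set3': [7, 8, 9]})
pd.merge(df1, df3, on='key', how='inner') # only keys A and B survive
pd.merge(df1, df3, on='key', how='outer') # keys A, B, C, D; missing values become NaN
pd.merge(df1, df3, on='key', how='left') # all keys of df1 (A, B, C); data_set3 is NaN for C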

Concatenate and Combine

arr1 = np.arange(9).reshape(3, 3)
arr2 = np.arange(9).reshape(3, 3)
np.concatenate([arr1, arr2]) # along axis 0 (stack rows)
s1 = Series([4, 5], index=['A', 'B'])
s2 = Series([1, 2, 3], index=['X', 'Y', 'Z'])
pd.concat([s1, s2]) # axis=0: one longer Series
pd.concat([s1, s2], axis=1) # axis=1: returns a DataFrame
df1 = DataFrame(np.random.randn(4, 3), columns=['X', 'Y', 'Z'])
df2 = DataFrame(np.random.randn(3, 3), columns=['X', 'Y', 'A'])
pd.concat([df1, df2]) # columns not shared by both frames are filled with NaN
s1 = Series([2, np.nan, 4, np.nan], index=['A', 'B', 'C', 'D'])
s2 = Series([1, 2, 3, 4], index=['A', 'B', 'C', 'D'])
s1.combine_first(s2) # NaN values in s1 are replaced by the corresponding values from s2
df1 = DataFrame({
    'X': [1, np.nan, 3, np.nan],
    'Y': [5, np.nan, 7, np.nan],
    'Z': [9, np.nan, 11, np.nan]
})
df2 = DataFrame({
    'Z': [np.nan, 10, np.nan, 12],
    'A': [1, 2, 3, 4],
})
df1.combine_first(df2) # NaN in df1 is filled from df2, and column A is added

Data preprocessing with apply

s1 = Series(['a'] * 7978)
df['A'] = s1 # df has 7978 rows
df['A'] = df['A'].apply(str.upper) # upper-case every value in column A
def foo(line):
    items = line.strip().split(' ')
    return Series([items[1], items[3], items[5]])
df_tmp = df['data'].apply(foo) # parse the raw 'data' column into three fields
df_tmp = df_tmp.rename(columns={0: "Symbol", 1: "Seqno", 2: "Price"})
df_new = df.combine_first(df_tmp)
del df_new['data'] # drop the raw column
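The snippet above assumes df has a raw 'data' column of space-separated text; a self-contained hedged sketch with made-up lines (the exact format is an assumption) shows how the apply-based parsing works end to end:

import pandas as pd
from pandas import Series, DataFrame
df = DataFrame({'data': ['Symbol: AAPL Seqno: 0 Price: 1623',
                         'Symbol: AAPL Seqno: 1 Price: 1649']})
def parse_line(line): # hypothetical parser, same idea as foo above
    items = line.strip().split(' ')
    return Series([items[1], items[3], items[5]])
df_tmp = df['data'].apply(parse_line)
df_tmp = df_tmp.rename(columns={0: 'Symbol', 1: 'Seqno', 2: 'Price'})
df_new = df.combine_first(df_tmp) # Symbol / Seqno / Price columns plus the original data column
del df_new['data'] # drop the raw column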

Data cleaning by removing duplicates

df['Seqno'].duplicated() # False: first occurrence, True: duplicate
df['Seqno'].drop_duplicates() # drop the duplicated values
df.drop_duplicates() # drop rows duplicated across all columns
df.drop_duplicates(['Seqno']) # deduplicate based on the Seqno column
df.drop_duplicates(['Seqno'], keep='last') # keep the last occurrence of each duplicate

Time series basics

from datetime import datetime
t1 = datetime(2009, 10, 20)
date_list = [
    datetime(2016, 9, 1),
    datetime(2016, 9, 10),
    datetime(2017, 9, 1),
    datetime(2017, 9, 20),
    datetime(2017, 10, 1)
]
s1 = Series(np.random.rand(5), index=date_list)
s1[1] # access by integer position
s1[datetime(2016, 9, 10)]
s1['2016-9-10']
s1['20160910']
s1['2016-09'] # all data for September 2016
s1['2016'] # all data for 2016
date_list_new = pd.date_range(start='2016-01-01', periods=100) # default freq='D' (daily)
date_list_new = pd.date_range(start='2016-01-01', periods=100, freq='W-MON') # weekly, weeks starting on Monday

Resampling and plotting time series data

t_range = pd.date_range(start='2016-01-01', end='2016-12-31')
s1 = Series(np.random.randn(len(t_range)), index=t_range)
s1['2016-01'].mean()
s1_month = s1.resample('M').mean() # resample by month and take the mean
s1.resample('H').ffill() # upsample to hourly with forward fill
s1.resample('H').bfill() # upsample to hourly with backward fill
t_range = pd.date_range('2016-01-01', '2016-12-31', freq='H')
stock_df = DataFrame(index=t_range)
stock_df['BABA'] = np.random.randint(80, 160, size=len(t_range))
stock_df['TENCENT'] = np.random.randint(30, 50, size=len(t_range))
stock_df.plot()
import matplotlib.pyplot as plt
plt.show()

Data binning

score_list = np.random.randint(25, 100, size=20)
bins = [0, 59, 70, 80, 100]
score_cut = pd.cut(score_list, bins) # assign each score to a bin
pd.value_counts(score_cut) # count how many scores fall into each bin
df = DataFrame()
df['score'] = score_list
df['student'] = [pd.util.testing.rands(3) for i in range(20)] # random 3-character names
df['Categories'] = pd.cut(df['score'], bins, labels=['Low', 'OK', 'Good', 'Great'])

Grouping data with GroupBy

g = df.groupby(by=df['city']) # group the rows by the city column
g.groups
df_bj = g.get_group('BJ') # the sub-DataFrame for BJ
df_bj.mean()
g.mean() # a DataFrame with one row of means per group

GroupBy = Split + Apply + Combine

dict(list(g))['BJ'] # the groups as a dict of sub-DataFrames
for name, group_df in g:
    print(name)
    print(group_df)
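A hedged sketch of what Split + Apply + Combine means when written out by hand (assuming the df with a 'city' column used above); the result matches g.mean():

pieces = dict(list(g)) # split: one sub-DataFrame per city
means = {city: sub_df.mean() for city, sub_df in pieces.items()} # apply: aggregate each piece
combined = pd.DataFrame(means).T # combine: stack the results back into one DataFrame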

Aggregation

def foo(attr):
    return attr.max() - attr.min() # custom aggregation: the range of each column
g.agg(foo)
g_new = df.groupby(['city', 'wind']) # group by two keys
g_new.groups
g_new.get_group(('BJ', 3))
for (name1, name2), group in g_new:
    pass

Pivot tables

pd.pivot_table(df, index=['Name']) # the default aggfunc is mean
pd.pivot_table(df, index=['Name'], aggfunc='sum', values=['Price'])
pd.pivot_table(df, index=['Name'], aggfunc='sum', values=['Price'], fill_value=0) # fill missing cells with 0

Matplotlib
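A minimal sketch of a basic Matplotlib line plot (illustrative data):

import numpy as np
import matplotlib.pyplot as plt
x = np.linspace(0, 2 * np.pi, 100)
plt.plot(x, np.sin(x), label='sin(x)') # simple line plot
plt.legend()
plt.show()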
