import pandas as pd
import numpy as np
np.random.seed(12345)
import matplotlib.pyplot as plt
1. 介绍Pandas
工具一:Series
obj = pd.Series([4, 7, -5, 3])
obj
0 4
1 7
2 -5
3 3
dtype: int64
print(obj.values)
print(obj.index) # 左闭右开
[ 4 7 -5 3]
RangeIndex(start=0, stop=4, step=1)
# Custom index: give every value an explicit label instead of the default 0..n-1.
obj2 = pd.Series(
    data=[4, 7, -5, 3],
    index=['d', 'b', 'a', 'c'],
)
print(obj2.index)    # the label index
print(obj2['a'])     # look a value up by its label
obj2
Index(['d', 'b', 'a', 'c'], dtype='object')
-5
d 4
b 7
a -5
c 3
dtype: int64
用索引去改值
obj2['d'] = 6
obj2[['c', 'a', 'd']]
c 3
a -5
d 6
dtype: int64
布尔值索引
obj2[obj2 > 0]
d 6
b 7
c 3
dtype: int64
乘法(返回一个新的Series,不会修改obj2本身,所以下面打印出来的obj2没有变化)
obj2 * 2
print(obj2)
d 6
b 7
a -5
c 3
dtype: int64
Series可以和numpy结合使用
# 指数
np.exp(obj2)
d 403.428793
b 1096.633158
a 0.006738
c 20.085537
dtype: float64
'b' in obj2 # 是看index
True
用字典生成Series
sdata = {'Ohio': 35000, 'Taxas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = pd.Series(sdata)
obj3
Ohio 35000
Taxas 71000
Oregon 16000
Utah 5000
dtype: int64
# Passing index= when building a Series from a dict fixes the order of the result.
# NOTE: a label listed in index that is not a key of the dict is still kept,
# but its value becomes NaN; dict keys not listed in index are left out.
states = ['California', 'Ohio', 'Oregon', 'Taxas']
obj4 = pd.Series(sdata, index = states)
obj4
California NaN
Ohio 35000.0
Oregon 16000.0
Taxas 71000.0
dtype: float64
判断是否为缺失值(NaN),返回布尔Series
pd.isnull(obj4) # 说明NaN就代表null obj4.isnull() 也可以的
California True
Ohio False
Oregon False
Taxas False
dtype: bool
pd.notnull(obj4)
California False
Ohio True
Oregon True
Taxas True
dtype: bool
含缺失值的Series相加:按索引对齐,只要有一方缺失(或没有该索引),结果就是NaN
obj3 + obj4
California NaN
Ohio 70000.0
Oregon 32000.0
Taxas 142000.0
Utah NaN
dtype: float64
Series对象本身和它的index属性都可以设置名字(name)
obj4.name = 'population'
obj4.index.name = 'state'
obj4
state
California NaN
Ohio 35000.0
Oregon 16000.0
Taxas 71000.0
Name: population, dtype: float64
改变索引
obj
0 4
1 7
2 -5
3 3
dtype: int64
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
obj
Bob 4
Steve 7
Jeff -5
Ryan 3
dtype: int64
2. 工具二:DataFrame
字典生成DataFrame
# Build a DataFrame from a dict of equal-length lists.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data)  # every dict entry becomes one column
print(frame.columns)
frame
Index(['state', 'year', 'pop'], dtype='object')
|
state |
year |
pop |
0 |
Ohio |
2000 |
1.5 |
1 |
Ohio |
2001 |
1.7 |
2 |
Ohio |
2002 |
3.6 |
3 |
Nevada |
2001 |
2.4 |
4 |
Nevada |
2002 |
2.9 |
5 |
Nevada |
2003 |
3.2 |
只看前5行
frame.head(5)
|
state |
year |
pop |
0 |
Ohio |
2000 |
1.5 |
1 |
Ohio |
2001 |
1.7 |
2 |
Ohio |
2002 |
3.6 |
3 |
Nevada |
2001 |
2.4 |
4 |
Nevada |
2002 |
2.9 |
改变columns顺序
pd.DataFrame(data, columns = ['year', 'state', 'pop'])
|
year |
state |
pop |
0 |
2000 |
Ohio |
1.5 |
1 |
2001 |
Ohio |
1.7 |
2 |
2002 |
Ohio |
3.6 |
3 |
2001 |
Nevada |
2.4 |
4 |
2002 |
Nevada |
2.9 |
5 |
2003 |
Nevada |
3.2 |
添加index
# A column named in columns= that does not exist in data (here 'debt')
# is still created, with every value set to NaN.
frame2 = pd.DataFrame(data, columns = ['year', 'state', 'pop', 'debt'], index = [str(i) for i in range(1, 7)])
frame2
|
year |
state |
pop |
debt |
1 |
2000 |
Ohio |
1.5 |
NaN |
2 |
2001 |
Ohio |
1.7 |
NaN |
3 |
2002 |
Ohio |
3.6 |
NaN |
4 |
2001 |
Nevada |
2.4 |
NaN |
5 |
2002 |
Nevada |
2.9 |
NaN |
6 |
2003 |
Nevada |
3.2 |
NaN |
frame2.index
Index(['1', '2', '3', '4', '5', '6'], dtype='object')
选一列
# Plain [] with a label selects a column; frame2['2'] raises an error
# because '2' is a row label, not a column name.
print(frame2['state']) # both spellings below select one column
print(frame2.year)
1 Ohio
2 Ohio
3 Ohio
4 Nevada
5 Nevada
6 Nevada
Name: state, dtype: object
1 2000
2 2001
3 2002
4 2001
5 2002
6 2003
Name: year, dtype: int64
选一行
frame2.loc['3']
year 2002
state Ohio
pop 3.6
debt NaN
Name: 3, dtype: object
给某一列赋值,一般用于给初始值
# 给一个得全赋值
frame2['debt'] = 16.5
frame2
|
year |
state |
pop |
debt |
1 |
2000 |
Ohio |
1.5 |
16.5 |
2 |
2001 |
Ohio |
1.7 |
16.5 |
3 |
2002 |
Ohio |
3.6 |
16.5 |
4 |
2001 |
Nevada |
2.4 |
16.5 |
5 |
2002 |
Nevada |
2.9 |
16.5 |
6 |
2003 |
Nevada |
3.2 |
16.5 |
frame2.debt = np.arange(6.)
frame2
|
year |
state |
pop |
debt |
1 |
2000 |
Ohio |
1.5 |
0.0 |
2 |
2001 |
Ohio |
1.7 |
1.0 |
3 |
2002 |
Ohio |
3.6 |
2.0 |
4 |
2001 |
Nevada |
2.4 |
3.0 |
5 |
2002 |
Nevada |
2.9 |
4.0 |
6 |
2003 |
Nevada |
3.2 |
5.0 |
# This shows that the built-in range() only accepts integers:
# the assignment below raises TypeError on purpose.
frame2.debt = range(6.)
frame2
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
/var/folders/9l/fdqtn9lj3mqd1mr7rsbr_qlc0000gn/T/ipykernel_57792/4184687594.py in <module>
1 # 这里可以看出,range必须放整数
----> 2 frame2.debt = range(6.)
3 frame2
TypeError: 'float' object cannot be interpreted as an integer
用Series给DataFrame的一列赋值
# Assigning a Series aligns on the index: whatever the column held before,
# every row whose label is missing from the Series becomes NaN.
# Here rows '2', '4', '5' receive -1.2, -1.5, -1.7 and rows '1', '3', '6' become NaN.
val = pd.Series([-1.2, -1.5, -1.7], index = ['2', '4', '5'])
frame2.debt = val
frame2
新加一列,可以是条件判断,写入值是布尔
frame2['eastern'] = (frame2.state == 'Ohio')
frame2
|
year |
state |
pop |
debt |
eastern |
1 |
2000 |
Ohio |
1.5 |
0.0 |
True |
2 |
2001 |
Ohio |
1.7 |
1.0 |
True |
3 |
2002 |
Ohio |
3.6 |
2.0 |
True |
4 |
2001 |
Nevada |
2.4 |
3.0 |
False |
5 |
2002 |
Nevada |
2.9 |
4.0 |
False |
6 |
2003 |
Nevada |
3.2 |
5.0 |
False |
删除一列
- 这里不可以写成del frame2.eastern,会报错:属性写法可以读取或给已有的列赋值,但不能用来删除列或新建列,删除/新建都要用frame2['eastern']
del frame2['eastern']
print(frame2.columns)
frame2
Index(['year', 'state', 'pop', 'debt'], dtype='object')
|
year |
state |
pop |
debt |
1 |
2000 |
Ohio |
1.5 |
0.0 |
2 |
2001 |
Ohio |
1.7 |
1.0 |
3 |
2002 |
Ohio |
3.6 |
2.0 |
4 |
2001 |
Nevada |
2.4 |
3.0 |
5 |
2002 |
Nevada |
2.9 |
4.0 |
6 |
2003 |
Nevada |
3.2 |
5.0 |
创建DataFrame,用嵌套字典
- 一级作为属性名columns
- 二级作为对应index名字
# Nested dict -> DataFrame: outer keys become columns, inner keys become row labels.
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2001: 1.7, 2002: 3.6, 2000: 1.5},
}
frame3 = pd.DataFrame(pop)
frame3
|
Nevada |
Ohio |
2001 |
2.4 |
1.7 |
2002 |
2.9 |
3.6 |
2000 |
NaN |
1.5 |
DataFrame转置:用.T属性(pandas自带,写法和numpy数组一样)
frame3.T
|
2001 |
2002 |
2000 |
Nevada |
2.4 |
2.9 |
NaN |
Ohio |
1.7 |
3.6 |
1.5 |
实现:把一个df中的一部分取出来,变成新的df
pdata = {
'Ohio':frame3['Ohio'][:],
'Nevada':frame3['Nevada'][:2]
}
pd.DataFrame(pdata)
|
Ohio |
Nevada |
2000 |
1.5 |
NaN |
2001 |
1.7 |
2.4 |
2002 |
3.6 |
2.9 |
DataFrame的index和columns都有名字了
frame3.index.name = 'year'
frame3.columns.name = 'state'
frame3
state |
Nevada |
Ohio |
year |
|
|
2001 |
2.4 |
1.7 |
2002 |
2.9 |
3.6 |
2000 |
NaN |
1.5 |
frame3的values属性
# 返回数组
frame3.values
array([[2.4, 1.7],
[2.9, 3.6],
[nan, 1.5]])
3. 索引对象
obj = pd.Series(range(3), index = ['a', 'b', 'c'])
obj
a 0
b 1
c 2
dtype: int64
# 提一下索引对象
index1 = obj.index
# 可见把索引变成一个序列了
index1[1:]
Index(['b', 'c'], dtype='object')
# Index objects are immutable: this assignment raises TypeError on purpose.
index1[1] = 'd'
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
/var/folders/9l/fdqtn9lj3mqd1mr7rsbr_qlc0000gn/T/ipykernel_57792/681980289.py in <module>
1 # 不可修改,会报错的
----> 2 index1[1] = 'd'
~/miniforge3/envs/NLP_search/lib/python3.8/site-packages/pandas/core/indexes/base.py in __setitem__(self, key, value)
4583 @final
4584 def __setitem__(self, key, value):
-> 4585 raise TypeError("Index does not support mutable operations")
4586
4587 def __getitem__(self, key):
TypeError: Index does not support mutable operations
直接用pd生成一个索引的对象
Index = pd.Index(np.arange(3))
Index
Int64Index([0, 1, 2], dtype='int64')
obj2 = pd.Series([1.5, -2.5, 0], index = Index)
obj2
0 1.5
1 -2.5
2 0.0
dtype: float64
甚至可以判断某个对象的索引是不是同一个索引对象
obj2.index is Index
True
3 in obj2.index
False
热知识:列(columns)也是索引对象——pd.Index
frame3.columns
Index(['Nevada', 'Ohio'], dtype='object', name='state')
'Ohio' in frame3.columns
True
索引对象的方法和属性
(原图缺失:此处原本是一张索引对象常用方法和属性的截图。常见的有 append、difference、intersection、union、isin、delete、drop、insert、is_unique、unique 等,具体以pandas官方文档为准。)
4. 主要功能
reindexing
- 用于Series时是改变行索引
- 用于DataFrame时可以改变行或列,默认是行
- reindex的参数
  - index 索引
  - method 填充方法
  - fill_value 缺失值的默认值
  - ……
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index = ['d', 'b', 'a', 'c'])
obj
d 4.5
b 7.2
a -5.3
c 3.6
dtype: float64
# 对应索引值相等直接复制,没有的index直接变成NaN,相当于重定序
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2
a -5.3
b 7.2
c 3.6
d 4.5
e NaN
dtype: float64
# The method= argument fills the gaps that reindexing introduces.
obj3 = pd.Series(data=['blue', 'purple', 'yellow'], index=[0, 2, 4])
# 'ffill' = forward fill: a new label takes the value of the label before it
# ('bfill' fills backward from the next label instead).
obj3 = obj3.reindex(index=range(6), method='ffill')
obj3
0 blue
1 blue
2 purple
3 purple
4 yellow
5 yellow
dtype: object
# On a DataFrame, reindex can change rows or columns; rows are the default.
frame = pd.DataFrame(
    np.arange(9).reshape(3, 3),
    index=['a', 'c', 'd'],
    columns=['Ohio', 'Texas', 'California'],
)
frame
|
Ohio |
Texas |
California |
a |
0 |
1 |
2 |
c |
3 |
4 |
5 |
d |
6 |
7 |
8 |
frame2 = frame.reindex(index = ['a', 'b', 'c', 'd'])
frame2
|
Ohio |
Texas |
California |
a |
0.0 |
1.0 |
2.0 |
b |
NaN |
NaN |
NaN |
c |
3.0 |
4.0 |
5.0 |
d |
6.0 |
7.0 |
8.0 |
states = ['Texas', 'Utah', 'California']
frame3 = frame.reindex(columns = states)
frame3
|
Texas |
Utah |
California |
a |
1 |
NaN |
2 |
c |
4 |
NaN |
5 |
d |
7 |
NaN |
8 |
Drop:从某个轴上删除条目
obj5 = pd.Series(np.arange(5.), index = ['a', 'b', 'c', 'd', 'e'])
obj5
a 0.0
b 1.0
c 2.0
d 3.0
e 4.0
dtype: float64
new_obj = obj5.drop('c')
new_obj
a 0.0
b 1.0
d 3.0
e 4.0
dtype: float64
data5 = pd.DataFrame(np.arange(16).reshape((4, 4)),
index = ['Ohio', 'Colorado', 'Utah', 'New York'],
columns = ['one', 'two', 'there', 'four'])
data5
|
one |
two |
there |
four |
Ohio |
0 |
1 |
2 |
3 |
Colorado |
4 |
5 |
6 |
7 |
Utah |
8 |
9 |
10 |
11 |
New York |
12 |
13 |
14 |
15 |
# drop() removes rows by default; pass axis='columns' (or axis=1) to remove columns instead.
data5.drop(['Ohio', 'Colorado'])
|
one |
two |
there |
four |
Utah |
8 |
9 |
10 |
11 |
New York |
12 |
13 |
14 |
15 |
data5.drop(['two', 'four'], axis = 'columns') # 写成axis = 1也是可以的
|
one |
there |
Ohio |
0 |
2 |
Colorado |
4 |
6 |
Utah |
8 |
10 |
New York |
12 |
14 |
# The drop() calls above returned new objects and left data5 untouched;
# inplace=True modifies data5 itself.
data5.drop('Utah', inplace = True)
data5
|
one |
two |
there |
four |
Ohio |
0 |
1 |
2 |
3 |
Colorado |
4 |
5 |
6 |
7 |
New York |
12 |
13 |
14 |
15 |
索引,选择,过滤
obj5 = pd.Series(np.arange(4.), index = ['a', 'b', 'c', 'd'])
obj5
a 0.0
b 1.0
c 2.0
d 3.0
dtype: float64
# Selecting from a Series: by label, by position, by boolean mask and by label slice.
# Positions go through .iloc explicitly: on a Series with string labels, plain
# obj5[1] / obj5[[1, 3]] treat integers as positions, which is deprecated since pandas 2.1.
print(obj5['b'])              # single label
print(obj5.iloc[1])           # single position
print(obj5.iloc[2:4])         # positional slice, end excluded
print(obj5.iloc[[1, 3]])      # list of positions
print(obj5[obj5.values < 2])  # boolean mask
# Label slices work too and, unlike positional slices, include the end label.
print(obj5['a':'c'])
obj5['b':'c'] = 5             # assignment through a label slice
obj5
1.0
1.0
c 2.0
d 3.0
dtype: float64
b 1.0
d 3.0
dtype: float64
a 0.0
b 1.0
dtype: float64
a 0.0
b 1.0
c 2.0
dtype: float64
a 0.0
b 5.0
c 5.0
d 3.0
dtype: float64
变成是DataFrame的情况
data5 = pd.DataFrame(np.arange(16).reshape((4, 4)),
index = ['Ohio', 'Colorado', 'Utah', 'New York'],
columns = ['one', 'two', 'there', 'four'])
data5
|
one |
two |
there |
four |
Ohio |
0 |
1 |
2 |
3 |
Colorado |
4 |
5 |
6 |
7 |
Utah |
8 |
9 |
10 |
11 |
New York |
12 |
13 |
14 |
15 |
# On a DataFrame, plain [] with a label selects a column (not a row).
data5['two']
Ohio 1
Colorado 5
Utah 9
New York 13
Name: two, dtype: int64
data5[data5.index == 'Ohio']
|
one |
two |
there |
four |
Ohio |
0 |
1 |
2 |
3 |
data5[:2] # a slice inside [] selects rows (labels inside [] select columns)
|
one |
two |
there |
four |
Ohio |
0 |
1 |
2 |
3 |
Colorado |
4 |
5 |
6 |
7 |
data5[data5['there'] > 5]
|
one |
two |
there |
four |
Colorado |
4 |
5 |
6 |
7 |
Utah |
8 |
9 |
10 |
11 |
New York |
12 |
13 |
14 |
15 |
data5['there'] > 5
Ohio False
Colorado True
Utah True
New York True
Name: there, dtype: bool
print(data5.values < 5)
data5 < 5
[[ True True True True]
[ True False False False]
[False False False False]
[False False False False]]
|
one |
two |
there |
four |
Ohio |
True |
True |
True |
True |
Colorado |
True |
False |
False |
False |
Utah |
False |
False |
False |
False |
New York |
False |
False |
False |
False |
loc与iloc
- loc是要index的名字
- iloc 只需要index的下标就行
- 为什么要区分这两个?
- 当index本身设置的值也是数字的时候,输入data[0]计算机会有歧义,所以一般用loc和iloc先声明一下
- 注意:loc取index本身的时候,是左闭右闭
- iloc是左闭右开
data6 = pd.DataFrame(np.arange(16).reshape((4, 4)),
index = ['Ohio', 'Colorado', 'Utah', 'New York'],
columns = ['one', 'two', 'there', 'four'])
data6
|
one |
two |
there |
four |
Ohio |
0 |
1 |
2 |
3 |
Colorado |
4 |
5 |
6 |
7 |
Utah |
8 |
9 |
10 |
11 |
New York |
12 |
13 |
14 |
15 |
data6.loc[['Colorado', 'New York'], ['two', 'there']]
|
two |
there |
Colorado |
5 |
6 |
New York |
13 |
14 |
data6.iloc[[1, 3], [1, 2]]
|
two |
there |
Colorado |
5 |
6 |
New York |
13 |
14 |
# loc取index本身的时候,是左闭右闭
data6.loc[:'Utah', 'one':'two']
|
one |
two |
Ohio |
0 |
1 |
Colorado |
4 |
5 |
Utah |
8 |
9 |
# 用iloc,这里就是左闭右开了
data6.iloc[:2]
|
one |
two |
there |
four |
Ohio |
0 |
1 |
2 |
3 |
Colorado |
4 |
5 |
6 |
7 |
5. 四则运算
- Series1 + Series2 索引相同就直接相加,遇到没见过索引就NaN
- frame1 + frame2 还是对应值相加,没有就NaN
- 几个四则运算
- obj.add
- sub
- div
- floordiv
- mul
- pow
df1 = pd.DataFrame(np.arange(9).reshape(3, 3), index = np.arange(3), columns = list('abc'))
df1
|
a |
b |
c |
0 |
0 |
1 |
2 |
1 |
3 |
4 |
5 |
2 |
6 |
7 |
8 |
df2 = pd.DataFrame(np.arange(16).reshape(4, 4), index = np.arange(4), columns = list('abcd'))
df2
|
a |
b |
c |
d |
0 |
0 |
1 |
2 |
3 |
1 |
4 |
5 |
6 |
7 |
2 |
8 |
9 |
10 |
11 |
3 |
12 |
13 |
14 |
15 |
# Punch a hole in df1 so the arithmetic that follows has a missing value to handle.
# The row label is 1 (not 0): every table shown for df1 in this section has the NaN in row 1.
df1.loc[1, 'b'] = np.nan
df1
|
a |
b |
c |
0 |
0 |
1.0 |
2 |
1 |
3 |
NaN |
5 |
2 |
6 |
7.0 |
8 |
df2.loc[1, 'b'] = np.nan
df2.loc[2, 'd'] = np.nan
df2
|
a |
b |
c |
d |
0 |
0 |
1.0 |
2 |
3.0 |
1 |
4 |
NaN |
6 |
7.0 |
2 |
8 |
9.0 |
10 |
NaN |
3 |
12 |
13.0 |
14 |
15.0 |
# 如果仅仅是相加
df1 + df2
|
a |
b |
c |
d |
0 |
0.0 |
2.0 |
4.0 |
NaN |
1 |
7.0 |
NaN |
11.0 |
NaN |
2 |
14.0 |
16.0 |
18.0 |
NaN |
3 |
NaN |
NaN |
NaN |
NaN |
# add() with fill_value: where only one of the two frames has a value, the missing
# side is replaced by fill_value before adding; cells missing in both frames stay NaN.
# Note: reindex() accepts a fill_value argument as well.
df1.add(df2, fill_value = 0)
|
a |
b |
c |
d |
0 |
0.0 |
2.0 |
4.0 |
3.0 |
1 |
7.0 |
NaN |
11.0 |
7.0 |
2 |
14.0 |
16.0 |
18.0 |
NaN |
3 |
12.0 |
13.0 |
14.0 |
15.0 |
1/df1
|
a |
b |
c |
0 |
inf |
1.000000 |
0.500 |
1 |
0.333333 |
NaN |
0.200 |
2 |
0.166667 |
0.142857 |
0.125 |
dataframe - series
- 当用一个df减去一个series时,每一行/列都减去对应series的值
- 为了避免歧义,直接加axis = xxx
data6 = pd.DataFrame(np.arange(16).reshape((4, 4)),
index = ['Ohio', 'Colorado', 'Utah', 'New York'],
columns = ['one', 'two', 'there', 'four'])
data6
|
one |
two |
there |
four |
Ohio |
0 |
1 |
2 |
3 |
Colorado |
4 |
5 |
6 |
7 |
Utah |
8 |
9 |
10 |
11 |
New York |
12 |
13 |
14 |
15 |
series = data6.loc[:, 'two']
series = series[['Colorado', 'Ohio', 'New York', 'Utah']]
series
Colorado 5
Ohio 1
New York 13
Utah 9
Name: two, dtype: int64
data6.sub(series, axis = 0)
|
one |
two |
there |
four |
Colorado |
-1 |
0 |
1 |
2 |
New York |
-1 |
0 |
1 |
2 |
Ohio |
-1 |
0 |
1 |
2 |
Utah |
-1 |
0 |
1 |
2 |
series = data6.loc['New York', :]
data6.sub(series, axis = 'columns')
|
one |
two |
there |
four |
Ohio |
-12 |
-12 |
-12 |
-12 |
Colorado |
-8 |
-8 |
-8 |
-8 |
Utah |
-4 |
-4 |
-4 |
-4 |
New York |
0 |
0 |
0 |
0 |
6. 一些映射
自定义函数作为映射函数
# 简单说明
s = pd.Series([1, 2, 3], index = list('abc'))
s.max()
3
frame7 = pd.DataFrame(np.random.randn(4, 3), columns = list('bde'), index = ['Utah', 'Ohio', 'Texas', 'Oregon'])
frame7
|
b |
d |
e |
Utah |
-0.204708 |
0.478943 |
-0.519439 |
Ohio |
-0.555730 |
1.965781 |
1.393406 |
Texas |
0.092908 |
0.281746 |
0.769023 |
Oregon |
1.246435 |
1.007189 |
-1.296221 |
# 对每一列进行操作
def f(x):
    """Return the spread of x, i.e. its maximum minus its minimum."""
    return x.max() - x.min()
frame7.apply(f, axis = 0)
b 1.802165
d 1.684034
e 2.689627
dtype: float64
def f(x):
    """Return the minimum and maximum of x as a Series labelled 'min' and 'max'.

    Used with DataFrame.apply, each row (or column) then expands into two columns.
    """
    return pd.Series([x.min(), x.max()], index=['min', 'max'])
frame7.apply(f, axis = 1)
|
min |
max |
Utah |
-0.519439 |
0.478943 |
Ohio |
-0.555730 |
1.965781 |
Texas |
0.092908 |
0.769023 |
Oregon |
-1.296221 |
1.246435 |
DataFrame.applymap()对每个元素应用函数(pandas 2.1起推荐改用DataFrame.map)
def format(x):  # NOTE: shadows the built-in format(); the name is kept because the following cells call it
    """Render x as a string with two decimal places."""
    return '%.2f' % x
frame7.applymap(format)
|
b |
d |
e |
Utah |
-0.20 |
0.48 |
-0.52 |
Ohio |
-0.56 |
1.97 |
1.39 |
Texas |
0.09 |
0.28 |
0.77 |
Oregon |
1.25 |
1.01 |
-1.30 |
# The trailing ? / ?? below is IPython/Jupyter help syntax, not plain Python.
# DataFrame.apply: "Apply a function along an axis of the DataFrame."
pd.DataFrame.apply?
# DataFrame.applymap: "Apply a function to a Dataframe elementwise."
pd.DataFrame.applymap??
frame7.loc[:, 'e'].map(format) # Series.map applies the function to every element of one column
Utah -0.52
Ohio 1.39
Texas 0.77
Oregon -1.30
Name: e, dtype: object
7. 排序
Series排序
obj8 = pd.Series(range(4), index = list('dabc'))
obj8
d 0
a 1
b 2
c 3
dtype: int64
print(obj8.sort_index())
obj8.sort_values() #如果有缺失值,就会被放到尾部
a 1
b 2
c 3
d 0
dtype: int64
d 0
a 1
b 2
c 3
dtype: int64
DataFrame排序
frame = pd.DataFrame(np.arange(8).reshape((2, 4)), index = ['three', 'one'], columns = list('dabc'))
frame
|
d |
a |
b |
c |
three |
0 |
1 |
2 |
3 |
one |
4 |
5 |
6 |
7 |
frame.sort_index(axis = 1)
|
a |
b |
c |
d |
three |
1 |
2 |
3 |
0 |
one |
5 |
6 |
7 |
4 |
frame.sort_index(axis=0)
|
d |
a |
b |
c |
one |
4 |
5 |
6 |
7 |
three |
0 |
1 |
2 |
3 |
frame.sort_values(by='b')
|
d |
a |
b |
c |
three |
0 |
1 |
2 |
3 |
one |
4 |
5 |
6 |
7 |
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame
|
b |
a |
0 |
4 |
0 |
1 |
7 |
1 |
2 |
-3 |
0 |
3 |
2 |
1 |
frame.sort_values(by = 'b')
|
b |
a |
2 |
-3 |
0 |
3 |
2 |
1 |
0 |
4 |
0 |
1 |
7 |
1 |
# 排序优先次序
frame.sort_values(by = ['a', 'b'])
|
b |
a |
2 |
-3 |
0 |
0 |
4 |
0 |
3 |
2 |
1 |
1 |
7 |
1 |
rank()
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj
0 7
1 -5
2 7
3 4
4 2
5 0
6 4
dtype: int64
# 按照值 从小到大排序,返回对应下标的值是排名
obj.rank()
0 6.5
1 1.0
2 6.5
3 4.5
4 3.0
5 2.0
6 4.5
dtype: float64
# 先看到的优先
obj.rank(method = 'first')
0 6.0
1 1.0
2 7.0
3 4.0
4 3.0
5 2.0
6 5.0
dtype: float64
# 倒序排名,max是说:两者并列第1和2名,那么就都写成2
obj.rank(ascending = False, method = 'max')
0 2.0
1 7.0
2 2.0
3 4.0
4 5.0
5 6.0
6 4.0
dtype: float64
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame
|
b |
a |
0 |
4 |
0 |
1 |
7 |
1 |
2 |
-3 |
0 |
3 |
2 |
1 |
frame.rank(axis = 1)
|
b |
a |
0 |
2.0 |
1.0 |
1 |
2.0 |
1.0 |
2 |
1.0 |
2.0 |
3 |
2.0 |
1.0 |
8. 统计函数
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame
|
b |
a |
0 |
4 |
0 |
1 |
7 |
1 |
2 |
-3 |
0 |
3 |
2 |
1 |
# Insert a missing value to show how the reduction below treats NaN.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
frame.loc[1, 'b'] = np.nan
frame
|
b |
a |
0 |
4.0 |
0 |
1 |
NaN |
1 |
2 |
-3.0 |
0 |
3 |
2.0 |
1 |
frame.sum(axis = 0) # axis = 0 is the default
# Missing values are skipped by default, i.e. they contribute 0 to the sum.
# Pass skipna = False to make a NaN propagate into the result instead.
b 3.0
a 2.0
dtype: float64
9. 归约函数
# df.idxmax() #按列:最大值的索引值
# df.cumsum() #按列求累和
# df.describe() #一次产生多个汇总统计,就是把所有什么count之类的全显示出来
obj = pd.Series(['a', 'a', 'b', 'c'] * 4)
obj
0 a
1 a
2 b
3 c
4 a
5 a
6 b
7 c
8 a
9 a
10 b
11 c
12 a
13 a
14 b
15 c
dtype: object
obj.describe()
count 16
unique 3
top a
freq 8
dtype: object
10. 唯一值筛选计数
obj = pd.Series(list('cadaabbcc'))
obj
0 c
1 a
2 d
3 a
4 a
5 b
6 b
7 c
8 c
dtype: object
# 去重
obj.unique()
array(['c', 'a', 'd', 'b'], dtype=object)
obj.value_counts() # 默认排序了,从大到小,不希望排序加一个sort = False
c 3
a 3
b 2
d 1
dtype: int64
obj.value_counts(sort = False)
c 3
a 3
d 1
b 2
dtype: int64
# 判断是不是在某个列表里
mask = obj.isin(['b', 'c'])
mask
0 True
1 False
2 False
3 False
4 False
5 True
6 True
7 True
8 True
dtype: bool
# 布尔值索引
obj[mask]
0 c
5 b
6 b
7 c
8 c
dtype: object
to_match = pd.Series(list('cabbca'))
to_match
0 c
1 a
2 b
3 b
4 c
5 a
dtype: object
unique_val = pd.Series(list('cba'))
# 对每一个在to_match的元素,寻找它在uniqueval里的索引值
pd.Index(unique_val).get_indexer(to_match)
array([0, 2, 1, 1, 0, 2])
dt = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],
'Qu2': [2, 3, 1, 2, 3],
'Qu3': [1, 5, 2, 4, 4]})
dt
|
Qu1 |
Qu2 |
Qu3 |
0 |
1 |
2 |
1 |
1 |
3 |
3 |
5 |
2 |
4 |
1 |
2 |
3 |
3 |
2 |
4 |
4 |
4 |
3 |
4 |
# Count how often each value occurs in every column; a value absent from a column
# comes out as NaN and is turned into 0 by fillna.
# pd.Series.value_counts replaces the top-level pd.value_counts, deprecated since pandas 2.1.
dt.apply(pd.Series.value_counts).fillna(0)
|
Qu1 |
Qu2 |
Qu3 |
1 |
1.0 |
1.0 |
1.0 |
2 |
0.0 |
2.0 |
1.0 |
3 |
2.0 |
2.0 |
0.0 |
4 |
2.0 |
0.0 |
2.0 |
5 |
0.0 |
0.0 |
1.0 |