创建一个 Series
import numpy as np
import pandas as pd
# --- Creating a Series ---
np.random.seed(1234)  # fix the seed so the random sample is reproducible
arr1 = np.random.randint(1, 10, 5)
print(arr1)
# From an ndarray with explicit string labels.
ser1 = pd.Series(arr1, index=['a', 'b', 'c', 'd', 'e'])
print(ser1)
# From a dict: the keys become the index.
dict1 = {"a": 1, "b": 2, "c": 3}
print(dict1)
ser2 = pd.Series(dict1)
print(ser2)
# From a scalar: the value is repeated for every label.
ser3 = pd.Series(3, index=['a', 'b', 'c'])
print(ser3)

# --- Common Series attributes ---
print(ser1.index)
ser4 = pd.Series(arr1, index=['a', 'b', 'c', 'd', 'e'], name='data')
print(ser4)
# rename() with a scalar changes the Series name (here in place).
ser4.rename('haha', inplace=True)
print(ser4)

# --- Series behaviour ---
print(ser1)
# Positional access goes through .iloc. Using plain [] with integer keys on a
# string-labelled index relies on a positional fallback that newer pandas
# releases deprecate, so it is spelled out explicitly here.
print(ser1.iloc[0])
print(ser1.iloc[1:4])
print(ser1.iloc[[1, 3, 4]])
# Boolean mask and mask-based filtering.
print(ser1 > 6)
print(ser1[ser1 > 6])
# NumPy ufuncs apply element-wise and keep the index.
print(np.exp(ser1))
# Label-based access; label slices include the end label.
print(ser1['c'])
print(ser1['c':'e'])
print(ser1[['a', 'c', 'd']])
ser1['a'] = 0
print(ser1)
# get() returns the fallback instead of raising when the label is missing.
print(ser1.get('b', '找不着'))
# Arithmetic aligns on labels; labels present on only one side give NaN.
ser5 = pd.Series([1, 2, 3, 4, 5], ['a', 'b', 'c', 'd', 'e'])
print(ser1 + ser5)
ser6 = pd.Series([1, 2, 3, 4, 5], ['a', 'b', 'k', 'd', 'n'])
print(ser1 + ser6)
创建一个DataFrame
import pandas as pd
import numpy as np
# --- Creating a DataFrame ---
# Constructor signature, for reference:
#   pd.DataFrame(data=None, index=None, columns=None, dtype=None, copy=False)
np.random.seed(1234)  # fix the seed so the random matrix is reproducible
arr = np.random.randint(1, 100, (3, 7))
print(arr)
# From a 2-D ndarray with explicit row and column labels.
df = pd.DataFrame(arr, index=['a', 'b', 'c'], columns=['A', 'B', 'C', 'D', 'E', 'F', 'G'])
print(df)
# From a dict of equal-length lists.
dict1 = {'员工姓名': ['张三', '李四', '王五'], '销售业绩': [20000, 30000, 50000], '提成': [4000, 6000, 10000]}
df1 = pd.DataFrame(dict1)
print(df1)
# From a dict of Series: the shorter Series leaves NaN in the missing row.
dict2 = {'员工姓名': pd.Series(['张三', '李四', '王五']), '销售业绩': pd.Series([20000, 30000]), '提成': pd.Series([4000, 6000, 10000])}
df2 = pd.DataFrame(dict2)
print(df2)

# --- Column operations ---
# A scalar assignment fills the whole new column.
df2['基本工资'] = 2500
print(df2)
# Derived columns computed from existing ones.
df2['创造收益'] = df2['销售业绩'] - df2['提成'] - df2['基本工资']
print(df2)
df2['是否达标'] = df2['创造收益'] > 20000
print(df2)
# Assigning a Series aligns on the index; row 1 has no value and becomes NaN.
df2['性别'] = pd.Series(['女', '男'], index=[0, 2])
print(df2)
# insert() places the new column at a given position (here before the last one).
df2.insert(6, '年龄', [25, 27, 30])
print(df2)
# Three ways to remove a column in place: pop(), del, and drop(inplace=True).
df2.pop('年龄')
print(df2)
del df2['性别']
print(df2)
# drop() without inplace returns a new frame and leaves df2 untouched, so the
# result has to be captured; printing df2 here would show no change.
df3 = df2.drop(columns=['是否达标'])
print(df3)
# Dropping rows by index label, this time modifying df2 itself.
df2.drop(index=[0, 1], inplace=True)
print(df2)
DataFrame的索引
import pandas as pd
import numpy as np
# --- DataFrame indexing ---
np.random.seed(1234)

# Build the sample staff table; the two-element Series leaves a NaN in row 2.
dict2 = {
    '员工姓名': pd.Series(['张三', '李四', '王五']),
    '销售业绩': pd.Series([20000, 30000]),
    '提成': pd.Series([4000, 6000, 10000]),
}
df2 = pd.DataFrame(dict2)
print(df2)

df2['基本工资'] = 2500
print(df2)

df2['创造收益'] = df2['销售业绩'].sub(df2['提成']).sub(df2['基本工资'])
print(df2)

df2['是否达标'] = df2['创造收益'].gt(20000)
print(df2)

# Only rows 0 and 2 receive a value; row 1 is left as NaN.
df2['性别'] = pd.Series({0: '女', 2: '男'})
print(df2)

df2.insert(6, '年龄', [25, 27, 30])
print(df2)

# Label-based selection with .loc (label slices include both ends).
print(df2.loc[1])
print(df2.loc[1, "员工姓名":"创造收益"])
print(df2.loc[1:2])

# Assigning to a label that does not exist yet appends a new row.
new_row = ['小明', 60000, 12000, 2500, np.nan, False, 24, '男']
df2.loc[3] = new_row
print(df2)

print(df2.loc[[0, 1, 3]])
print(df2.loc[:, ['员工姓名', '是否达标']])

# Position-based selection with .iloc.
print(df2.iloc[1])
print(df2.iloc[1, [0, 2]])

# Boolean filtering: keep rows matching either condition.
high_commission = df2['提成'] >= 10000
under_thirty = df2['年龄'] < 30
print(df2[high_commission | under_thirty])
DataFrame的相关操作
import pandas as pd
import numpy as np
# --- Common DataFrame operations ---
# Quick reference (not executed here):
#   df.shape                          # shape of the frame
#   df.head(2)                        # first rows (default 5)
#   df.tail(3)                        # last rows (default 5)
#   df.index                          # row index
#   df.columns                        # column index
#   df.info()                         # detailed information about the frame
#   df.describe()                     # simple descriptive statistics
#   df.sort_index()                   # sort by index, ascending
#   df.sort_index(ascending=False)    # sort by index, descending
#   df.sort_values("销售业绩")         # sort by a column, ascending by default
#   df["工龄"] = [1,3,7,5,6,4,2]
#   df.sort_values(["销售业绩","工龄"])                          # both ascending
#   df.sort_values(["销售业绩","工龄"],ascending=False)          # both descending
#   df.sort_values(["销售业绩","工龄"],ascending=[False,True])   # first descending, second ascending

# Sample staff table with a few deliberately missing values.
dict2 = {'员工姓名': pd.Series(['张三', '李四', '王五', '瓜六']), '销售业绩': pd.Series([20000, 30000, np.nan, 20000]), '提成': pd.Series([4000, 6000, 10000, 4000])}
df2 = pd.DataFrame(dict2)
df2['基本工资'] = 2500
df2['创造收益'] = df2['销售业绩'] - df2['提成'] - df2['基本工资']
df2['是否达标'] = df2['创造收益'] > 20000
df2['性别'] = pd.Series(['女', '男', '男'], index=[0, 2, 3])
df2.insert(6, '年龄', [25, 27, 30, 27])
print(df2)

# --- Missing values: detect, count, fill, drop ---
print(df2.isnull())
print(df2.isnull().sum())
print(df2.isnull().mean())
print(df2.isnull().sum() / df2.shape[0])
print(df2['性别'].value_counts())
print(df2['性别'].fillna('男'))
# numeric_only=True: the frame has string columns, and reducing over them
# raises TypeError on pandas 2.x instead of silently skipping them.
print(df2.fillna(df2.mean(numeric_only=True)))
# ffill()/bfill() replace the deprecated fillna(method=...) spelling.
print(df2.ffill())
print(df2.bfill(limit=1))
print(df2.replace(np.nan, 0))
# Per-column replacement values are passed as a {column: value} dict.
print(df2.replace(np.nan, df2.median(numeric_only=True).to_dict()))
# The original used replace(method='ffill'), which is deprecated; ffill() is
# the supported way to forward-fill missing values.
print(df2.ffill())
print(df2.dropna())
print(df2.dropna(axis=1))

# --- Combining tables: pd.concat() and pd.merge() ---
# Split by rows, then stack the pieces back with a key per piece.
df_1 = df2.iloc[:2]
df_2 = df2.iloc[2:]
print(df_1)
print(df_2)
print(pd.concat([df_1, df_2], keys=['df_1', 'df_2']))
# Split by columns (both parts keep the name column), then join side by side.
df_s = df2.iloc[:, 0:4]
df_p = df2.iloc[:, [0, 4, 5, 6, 7]]
print(df_s)
print(df_p)
print(pd.concat([df_s, df_p], axis=1, keys=['df_s', 'df_p'], names=['来源', '索引']))
print(pd.merge(df_s, df_p, on='员工姓名'))

# --- Grouping: df.groupby() ---
gp1 = df2.groupby('性别')
print(len(gp1))
print(gp1.size())
# numeric_only=True for the same reason as above (string name column).
print(gp1.mean(numeric_only=True))
print(gp1['销售业绩'].mean())
# as_index must be the boolean False; the string 'False' is truthy and was
# therefore treated as as_index=True.
gp2 = df2.groupby(['性别', '是否达标'], as_index=False)
print(gp2.size())
# Aggregations are named by string; passing NumPy callables is deprecated.
print(gp1['销售业绩'].agg(['mean', 'std']).rename(columns={'mean': "平均", 'std': '标准差'}))
print(gp1.agg({'销售业绩': 'mean', '提成': 'std'}))

# --- Pivot tables: pd.pivot_table() ---
print(pd.pivot_table(df2, values='创造收益', index='员工姓名', columns='年龄'))
print(pd.pivot_table(df2, values='创造收益', index='员工姓名', columns='年龄', aggfunc='sum'))
print(pd.pivot_table(df2, values=['创造收益', '销售业绩'], index=['员工姓名', '性别'], columns=['年龄', '是否达标'], aggfunc=['sum', 'mean']))
# --- Reading and writing data ---
# Readers: pd.read_csv(), pd.read_excel()
# Writers: df.to_csv(), df.to_excel()
# NOTE(review): the paths below are hard-coded to a local Windows drive and
# must exist for this section to run.
df_test = pd.read_csv('D:/TestFile/from.txt', sep=',')
print(df_test)
df_test1 = pd.read_excel('D:/code/python/销售数据.xlsx', sheet_name=0)
print(df_test1)
df2.to_csv('D:/code/python/测试数据.csv', encoding='utf-8')
df2.to_csv('D:/code/python/测试数据.txt', encoding='utf-8')
# to_excel() does not take an `encoding` argument on pandas 2.x (passing it
# raises TypeError), so it is omitted here.
df2.to_excel('D:/code/python/测试数据.xlsx', sheet_name='员工表')
本文内容由网友自发贡献，版权归原作者所有，本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容，请联系：hwhale#tublm.com（使用前将#替换为@）