pd.pivot_table方法
df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
'B' : ['A', 'B', 'C'] * 4,'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
'D' : np.random.randn(12),'E' : np.random.randn(12)})
print(df,end='\n\n')
df = pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'],aggfunc='sum')
print(df)
运行结果:
A B C D E
0 one A foo 0.600810 -0.175178
1 one B foo 0.052015 0.542310
2 two C foo -0.316955 -1.195068
3 three A bar 0.810066 -1.012746
4 one B bar -0.836399 1.483205
5 one C bar -0.187096 0.145748
6 two A foo 0.878007 -1.266941
7 three B foo -0.226375 -0.921724
8 one C foo 0.073522 -1.018228
9 one A bar -0.541353 -0.235584
10 two B bar 1.263191 1.315970
11 three C bar 0.282756 -0.423348
C bar foo
A B
one A -0.541353 0.600810
B -0.836399 0.052015
C -0.187096 0.073522
three A 0.810066 NaN
B NaN -0.226375
C 0.282756 NaN
two A NaN 0.878007
B 1.263191 NaN
C NaN -0.316955
Process finished with exit code 0
def func2(df):
bins = [0,100,200,300,400,500,1000,9999]
labels = ['价格≤100元','100<价格≤200元','200<价格≤300元','300<价格≤400元','400元<价格≤500元','500元<价格≤1000元', '价格>1000']
df = df.loc[df['货物名称'].isin(['铝棒', '铝锭', '铝杆'])]
df = pd.pivot_table(df, values='数量(吨)', index=pd.cut(df['合同加工费'], bins=bins, labels=labels), columns=['货物名称'], aggfunc='sum')
print(df)
pandas自定义二维数组
def func(df):
return pd.Series({
'价格>1000': df.loc[df['合同加工费'] > 1000, '数量(吨)'].sum(),
'500元<价格≤1000元': df.loc[(df['合同加工费'] > 500) & (df['合同加工费'] <= 100), '数量(吨)'].sum(),
'400元<价格≤500元': df.loc[(df['合同加工费'] > 400) & (df['合同加工费'] <= 500), '数量(吨)'].sum(),
'300<价格≤400元': df.loc[(df['合同加工费'] > 300) & (df['合同加工费'] <= 400), '数量(吨)'].sum(),
'200<价格≤300元': df.loc[(df['合同加工费'] > 200) & (df['合同加工费'] <= 300), '数量(吨)'].sum(),
'100<价格≤200元': df.loc[(df['合同加工费'] > 100) & (df['合同加工费'] <= 200), '数量(吨)'].sum(),
'价格≤100元': df.loc[df['合同加工费'] <= 100, '数量(吨)'].sum(),
})
resdf = df.loc[df['货物名称'].isin(['铝棒','铝锭','铝杆'])].groupby('货物名称').apply(func)
print(resdf.T)
func2(df)
DataFrame将一行拆分成多行
kk = {'a': [12, 24, 33], 'b': ['h', 'a', 's'], 'c': ['dd', 'gg,jj,xx', 'dfx,we,ued,qsx']}
df = pd.DataFrame(kk)
df = df.drop('c', axis=1).join(df['c'].str.split(',', expand=True).stack().reset_index(level=1, drop=True).rename('c'))
df1 = df['c'].str.split(',', expand=True)
df1 = df1.stack().reset_index(level=0).set_index('level_0')
df1 = df1.rename(columns={0: 'c'})
df1 = df1.join(df.drop('c', axis=1))
print(df1)
表格汇总1
import pandas as pd
df = pd.read_excel(r'E:\PyProjts\Python_Prefereneces/新建 Microsoft Excel 工作表.xlsx',
sheet_name=0,
usecols=[0, 1, 2, 3],
nrows=8)
print(df)
def func(df):
if 0 in df['id1']:
return df
else:
tempdf = df[df['类型'] == 'X']
tempdf['金额'] = df['金额'].sum()
return tempdf
df1 = df.groupby('id1').apply(func)
print(df1)
运行结果:
id1 id2 金额 类型
0 0 90001 100 NaN
1 0 90002 200 NaN
2 0 90003 300 NaN
3 80001 60001 100 X
4 80001 60002 200 N
5 80001 60003 300 P
6 80002 60004 100 N
7 80002 60005 200 X
id1 id2 金额 类型
id1
0 0 0 90001 100 NaN
1 0 90002 200 NaN
2 0 90003 300 NaN
80001 3 80001 60001 600 X
80002 7 80002 60005 300 X
Process finished with exit code 0
df.groupuby + df.replace数据清洗+归类
import pandas as pd
df = pd.read_excel(r'E:\PyProjts\Python_Prefereneces\df_1/源文件全部A股.xls', usecols='A:C')
df = df[df.iloc[:, 2].notnull()]
df['证券代码'].replace(r'(?i)(\d+)[^\da-z]+([a-z]+)', r'\2\1', regex=True, inplace=True)
def func(df):
key = df.iloc[:, 2].unique()
df[['证券代码']].to_csv(f'E:\PyProjts\Python_Prefereneces\df_1\\result/{key[0]}[东财].txt', header=False, index=False)
df.groupby(df.iloc[:, 2]).apply(func)
df.rolling()方法学习
import pandas as pd
df = pd.DataFrame({'1': ['A1', 'A2', 'A1', 'A2', 'A2', 'A1', 'A2'],
'2': ['B1', 'B1', 'B1', 'B1', 'B1', 'B1', 'B1'],
'num': [1, 2, 1, 3, 4, 2, 1]},
index=[pd.Timestamp('20130101 09:00:00'),
pd.Timestamp('20130101 09:00:01'),
pd.Timestamp('20130101 09:00:02'),
pd.Timestamp('20130101 09:00:03'),
pd.Timestamp('20130101 09:00:04'),
pd.Timestamp('20130101 09:00:05'),
pd.Timestamp('20130101 09:00:06')])
print(df)
for name, group in df.groupby(['1', '2']):
print(name)
print(group)
print(df.groupby(['1', '2'])['num'].rolling(window='3s', closed='both').sum())
运行结果:
1 2 num
2013-01-01 09:00:00 A1 B1 1
2013-01-01 09:00:01 A2 B1 2
2013-01-01 09:00:02 A1 B1 1
2013-01-01 09:00:03 A2 B1 3
2013-01-01 09:00:04 A2 B1 4
2013-01-01 09:00:05 A1 B1 2
2013-01-01 09:00:06 A2 B1 1
('A1', 'B1')
1 2 num
2013-01-01 09:00:00 A1 B1 1
2013-01-01 09:00:02 A1 B1 1
2013-01-01 09:00:05 A1 B1 2
('A2', 'B1')
1 2 num
2013-01-01 09:00:01 A2 B1 2
2013-01-01 09:00:03 A2 B1 3
2013-01-01 09:00:04 A2 B1 4
2013-01-01 09:00:06 A2 B1 1
1 2
A1 B1 2013-01-01 09:00:00 1.0
2013-01-01 09:00:02 2.0
2013-01-01 09:00:05 3.0
A2 B1 2013-01-01 09:00:01 2.0
2013-01-01 09:00:03 5.0
2013-01-01 09:00:04 9.0
2013-01-01 09:00:06 8.0
Name: num, dtype: float64
Process finished with exit code 0
df.groupby + df[].iat对该列最后一个求和
import pandas as pd
df = pd.read_excel('./Book1.xlsm',usecols=[0,1,2],skiprows=2)
def func(df):
df['合同金额'].iat[-1] = df['每期租金支出不含税'].sum()
return df
df.groupby('SEQ_NUM').apply(func)