Pandas数据分析整理

pd.pivot_table方法

# 数据透视表一样行列转换
df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
'B' : ['A', 'B', 'C'] * 4,'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
'D' : np.random.randn(12),'E' : np.random.randn(12)})
print(df,end='\n\n')
df = pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'],aggfunc='sum')
print(df)

运行结果：
        A  B    C         D         E
0     one  A  foo  0.600810 -0.175178
1     one  B  foo  0.052015  0.542310
2     two  C  foo -0.316955 -1.195068
3   three  A  bar  0.810066 -1.012746
4     one  B  bar -0.836399  1.483205
5     one  C  bar -0.187096  0.145748
6     two  A  foo  0.878007 -1.266941
7   three  B  foo -0.226375 -0.921724
8     one  C  foo  0.073522 -1.018228
9     one  A  bar -0.541353 -0.235584
10    two  B  bar  1.263191  1.315970
11  three  C  bar  0.282756 -0.423348

C             bar       foo
A     B                    
one   A -0.541353  0.600810
      B -0.836399  0.052015
      C -0.187096  0.073522
three A  0.810066       NaN
      B       NaN -0.226375
      C  0.282756       NaN
two   A       NaN  0.878007
      B  1.263191       NaN
      C       NaN -0.316955

Process finished with exit code 0

def func2(df):
    bins = [0,100,200,300,400,500,1000,9999]
    labels = ['价格≤100元','100＜价格≤200元','200＜价格≤300元','300＜价格≤400元','400元＜价格≤500元','500元＜价格≤1000元', '价格＞1000']
    df = df.loc[df['货物名称'].isin(['铝棒', '铝锭', '铝杆'])]
    df = pd.pivot_table(df, values='数量(吨)', index=pd.cut(df['合同加工费'], bins=bins, labels=labels), columns=['货物名称'], aggfunc='sum')
    print(df)

pandas自定义二维数组

def func(df):
    return pd.Series({
        '价格＞1000': df.loc[df['合同加工费'] > 1000, '数量(吨)'].sum(),
        '500元＜价格≤1000元': df.loc[(df['合同加工费'] > 500) & (df['合同加工费'] <= 100), '数量(吨)'].sum(),
        '400元＜价格≤500元': df.loc[(df['合同加工费'] > 400) & (df['合同加工费'] <= 500), '数量(吨)'].sum(),
        '300＜价格≤400元': df.loc[(df['合同加工费'] > 300) & (df['合同加工费'] <= 400), '数量(吨)'].sum(),
        '200＜价格≤300元': df.loc[(df['合同加工费'] > 200) & (df['合同加工费'] <= 300), '数量(吨)'].sum(),
        '100＜价格≤200元': df.loc[(df['合同加工费'] > 100) & (df['合同加工费'] <= 200), '数量(吨)'].sum(),
        '价格≤100元': df.loc[df['合同加工费'] <= 100, '数量(吨)'].sum(),
    })

resdf = df.loc[df['货物名称'].isin(['铝棒','铝锭','铝杆'])].groupby('货物名称').apply(func)
print(resdf.T)
func2(df)

DataFrame将一行拆分成多行

# DataFrame将一行按拆分成多行
kk = {'a': [12, 24, 33], 'b': ['h', 'a', 's'], 'c': ['dd', 'gg,jj,xx', 'dfx,we,ued,qsx']}
df = pd.DataFrame(kk)

df = df.drop('c', axis=1).join(df['c'].str.split(',', expand=True).stack().reset_index(level=1, drop=True).rename('c'))

df1 = df['c'].str.split(',', expand=True)
df1 = df1.stack().reset_index(level=0).set_index('level_0')
df1 = df1.rename(columns={0: 'c'})
df1 = df1.join(df.drop('c', axis=1))
print(df1)

表格汇总1

import pandas as pd

df = pd.read_excel(r'E:\PyProjts\Python_Prefereneces/新建 Microsoft Excel 工作表.xlsx',
                   sheet_name=0,
                   usecols=[0, 1, 2, 3],
                   nrows=8)
print(df)

# df1 = df[df['id1'] != 0].groupby('id1').agg('sum')

def func(df):
    if 0 in df['id1']:
        return df
    else:
        tempdf = df[df['类型'] == 'X']
        tempdf['金额'] = df['金额'].sum()
        return tempdf

df1 = df.groupby('id1').apply(func)

print(df1)

运行结果：
     id1    id2   金额   类型
0      0  90001  100  NaN
1      0  90002  200  NaN
2      0  90003  300  NaN
3  80001  60001  100    X
4  80001  60002  200    N
5  80001  60003  300    P
6  80002  60004  100    N
7  80002  60005  200    X
           id1    id2   金额   类型
id1                            
0     0      0  90001  100  NaN
      1      0  90002  200  NaN
      2      0  90003  300  NaN
80001 3  80001  60001  600    X
80002 7  80002  60005  300    X

Process finished with exit code 0

df.groupuby + df.replace数据清洗+归类

import pandas as pd

df = pd.read_excel(r'E:\PyProjts\Python_Prefereneces\df_1/源文件全部A股.xls', usecols='A:C')
df = df[df.iloc[:, 2].notnull()]
df['证券代码'].replace(r'(?i)(\d+)[^\da-z]+([a-z]+)', r'\2\1', regex=True, inplace=True)


def func(df):
    key = df.iloc[:, 2].unique()
    df[['证券代码']].to_csv(f'E:\PyProjts\Python_Prefereneces\df_1\\result/{key[0]}[东财].txt', header=False, index=False)


df.groupby(df.iloc[:, 2]).apply(func)
# print(df.groupby(df.iloc[:, 2]).get_group('专业服务'))

df.rolling()方法学习

import pandas as pd
df = pd.DataFrame({'1': ['A1', 'A2', 'A1', 'A2', 'A2', 'A1', 'A2'],
                   '2': ['B1', 'B1', 'B1', 'B1', 'B1', 'B1', 'B1'],
                   'num': [1, 2, 1, 3, 4, 2, 1]},
                  index=[pd.Timestamp('20130101 09:00:00'),
                         pd.Timestamp('20130101 09:00:01'),
                         pd.Timestamp('20130101 09:00:02'),
                         pd.Timestamp('20130101 09:00:03'),
                         pd.Timestamp('20130101 09:00:04'),
                         pd.Timestamp('20130101 09:00:05'),
                         pd.Timestamp('20130101 09:00:06')])
print(df)
for name, group in df.groupby(['1', '2']):
    print(name)
    print(group)
print(df.groupby(['1', '2'])['num'].rolling(window='3s', closed='both').sum())

运行结果：
                      1   2  num
2013-01-01 09:00:00  A1  B1    1
2013-01-01 09:00:01  A2  B1    2
2013-01-01 09:00:02  A1  B1    1
2013-01-01 09:00:03  A2  B1    3
2013-01-01 09:00:04  A2  B1    4
2013-01-01 09:00:05  A1  B1    2
2013-01-01 09:00:06  A2  B1    1
('A1', 'B1')
                      1   2  num
2013-01-01 09:00:00  A1  B1    1
2013-01-01 09:00:02  A1  B1    1
2013-01-01 09:00:05  A1  B1    2
('A2', 'B1')
                      1   2  num
2013-01-01 09:00:01  A2  B1    2
2013-01-01 09:00:03  A2  B1    3
2013-01-01 09:00:04  A2  B1    4
2013-01-01 09:00:06  A2  B1    1
1   2                      
A1  B1  2013-01-01 09:00:00    1.0
        2013-01-01 09:00:02    2.0
        2013-01-01 09:00:05    3.0
A2  B1  2013-01-01 09:00:01    2.0
        2013-01-01 09:00:03    5.0
        2013-01-01 09:00:04    9.0
        2013-01-01 09:00:06    8.0
Name: num, dtype: float64

Process finished with exit code 0

df.groupby + df[].iat对该列最后一个求和

# -*- coding: utf-8 -*-
# version: Python 3.7.0
import pandas as pd

df = pd.read_excel('./Book1.xlsm',usecols=[0,1,2],skiprows=2)
def func(df):
    df['合同金额'].iat[-1] = df['每期租金支出不含税'].sum()
    return df
df.groupby('SEQ_NUM').apply(func)

4种方法带你学会Ubuntu软件的安装

前言在嵌入式的开发中，不会用在linux中开发，linux只是起到一个软件编译的效果，但是在其她领域的开发中，Ubuntu的软件安装还是很有必要的。在Linux的早期，安装软件是一件痛苦的事。幸好Linux开发人员已经通过把

File类（20210906）

一.构建file对象构建file对象也就是将file文件加载到java的变量中。 1.全名称（全路径）构建

VMware Fusion使用共享模式下虚拟机断网的解决方案

VMware Fusion 下虚拟机使用NAT共享网络时时常不间断断网，每次都需要把虚拟机关掉，重新启动VMware Funsion才可以打开网络。研究了下，发现实际也可以通过命令行来重启NAT，无需退出VMware Funsion.

HTML应用开发基础篇 - NodeJS

2017-05-11 DBA日记，导致log file sync单次等待超500ms又一可能

现象描述：在DB1数据库的lgwr日志里发现lgwr写入小量数据（1kb或几十KB）却延时严重（500ms以上）问题：为什么redo log写入小量数

Akka使用入门

一Akka简单介绍

eclipse无法导入import pygame解决方法

环境：本人的PYthon开发环境为Eclipse3.2, PyDev插件。Python版本为3.1版本。下载pygame-1.9.1.win32-py3.1.msi直接进行安装。在python第三方库目录C:/Python31/Li

从零开始学自动驾驶（1）——软件框架篇

# 在此首先感谢任佬的无私分享，有兴趣的同学可以去任佬的知乎进行学习从零开始做自动驾驶定位

海天味业智能化生产，酿造原汁原味的海天酱油

幼时的记忆中，有提着空瓶去打酱油的场景。在小商店，掀开酱油缸就能够闻到清香的酱油味道，然后商店老板用竹筒从酱油缸中提出满满的酱油，通过漏斗缓缓倒入酱油瓶中。随着时间的变化，人们的生活水平在显著提高，散装的酱油早已经离我们远去，人们在选择酱油的时候，也越来越注重品牌与品质。品牌宣传

[转载] python定义整型常量_Python笔记——数据类型、变量和常量

参考链接： Python变量，常量和文字数据类型计算机顾名思义就是可以做数学计