#!/usr/bin/env python3
__author__ = '未昔/angelfate'
__date__ = '2019/8/6 10:52'
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
"""
数据结构 DataFrame
创建,读写
"""
# Walkthrough of basic pandas DataFrame usage: building frames from dicts,
# selecting columns and rows, assigning columns, transposing, and naming the
# axes. Each step prints its result so the script can be read next to its
# console output.

print('用字典生成DataFrame,key为列名(默认key是无序的)。')
data = {
    'name': ['wang', 'wei', 'RPA', 'python', 'linux', 'C'],
    'year': [2000, 2010, 2020, 2030, 2040, 2050],
    'pop': [1.0, 1.1, 1.2, 1.3, 1.4, 1.5],
}
print('\n')
print(pd.DataFrame(data))  # dict keys become the column labels

print('----指定索引顺序----')
# Passing `columns` fixes the column order explicitly.
print(pd.DataFrame(data, columns=['name', 'year', 'pop']))

print('----指定索引,不存在的列,默认使用数据NaN----')
# 'state' and 'con' are not keys of `data`, so those columns start out as NaN.
data2 = pd.DataFrame(
    data,
    columns=['name', 'year', 'pop', 'state', 'con'],
    index=['one', 'two', 'three', 'four', 'five', 'six'],
)
print(data2)

print('----查询指定 列索引 数据---')
print(data2['name'])  # column lookup by label
print(data2.year)     # the same kind of lookup via attribute access

print('----查询指定 行索引 数据----')
# `.ix` was removed in pandas 1.0; `.loc` is the label-based row accessor.
print(data2.loc['two'])

print('----修改列值----')
data2['state'] = 16.5  # a scalar is broadcast to every row of the column
print(data2)
data2['con'] = np.arange(6)  # an array must match the number of rows
print(data2)
print('\n')

print('用Serice指定要修改的索引及对应值,没有指定的数据用Nan。')
# Assignment from a Series aligns on the index: rows missing from `val`
# ('one', 'four', 'six') become NaN.
val = pd.Series([-1.0, -1.1, -1.2], index=['two', 'three', 'five'])
data2['con'] = val
print(data2)
print('\n')

print('---赋值给新列---')
# Assigning to a label that does not exist yet creates the column; here it
# holds the boolean result of comparing each name with 'wang'.
data2['new_data'] = (data2['name'] == 'wang')
print(data2)
print(data2.columns)  # column labels
print('\n')

print('---DataFrame转置---')
# Nested dicts: outer keys become columns, inner keys become the row index.
values = {
    'Nevada': {2001: 1.1, 2002: 2.2},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
# NOTE(review): the row order produced from a dict of dicts appears to differ
# between pandas versions (sorted vs. insertion order); sorting makes the
# positional slices further down deterministic. Confirm against the pandas
# version in use.
data3 = pd.DataFrame(values).sort_index()
print(data3)
print(data3.T)  # swap rows and columns
print('\n')

print('---指定索引顺序,使用切片初始化顺序---')
# An explicit `index` selects/reorders rows; 2003 has no data, so it is NaN.
print(pd.DataFrame(values, index=[2001, 2002, 2003]))
# Build a frame from positional slices of existing columns: all but the last
# row of 'Ohio' and the first two rows of 'Nevada'.
data4 = {'Ohio': data3['Ohio'].iloc[:-1], 'Nevada': data3['Nevada'].iloc[:2]}
print(pd.DataFrame(data4))

print('---指定索引和列名---')
data3.index.name = 'year'
data3.columns.name = 'state'
print(data3)
print(data3.values)  # underlying data only, as a NumPy array
print(data2.values)
结果(以上脚本的运行输出):
用字典生成DataFrame,key为列名(默认key是无序的)。
name pop year
0 wang 1.0 2000
1 wei 1.1 2010
2 RPA 1.2 2020
3 python 1.3 2030
4 linux 1.4 2040
5 C 1.5 2050
----指定索引顺序----
name year pop
0 wang 2000 1.0
1 wei 2010 1.1
2 RPA 2020 1.2
3 python 2030 1.3
4 linux 2040 1.4
5 C 2050 1.5
----指定索引,不存在的列,默认使用数据NaN----
name year pop state con
one wang 2000 1.0 NaN NaN
two wei 2010 1.1 NaN NaN
three RPA 2020 1.2 NaN NaN
four python 2030 1.3 NaN NaN
five linux 2040 1.4 NaN NaN
six C 2050 1.5 NaN NaN
----查询指定 列索引 数据---
one wang
two wei
three RPA
four python
five linux
six C
Name: name, dtype: object
one 2000
two 2010
three 2020
four 2030
five 2040
six 2050
Name: year, dtype: int64
----查询指定 行索引 数据----
name wei
year 2010
pop 1.1
state NaN
con NaN
Name: two, dtype: object
----修改列值----
name year pop state con
one wang 2000 1.0 16.5 NaN
two wei 2010 1.1 16.5 NaN
three RPA 2020 1.2 16.5 NaN
four python 2030 1.3 16.5 NaN
five linux 2040 1.4 16.5 NaN
six C 2050 1.5 16.5 NaN
name year pop state con
one wang 2000 1.0 16.5 0
two wei 2010 1.1 16.5 1
three RPA 2020 1.2 16.5 2
four python 2030 1.3 16.5 3
five linux 2040 1.4 16.5 4
six C 2050 1.5 16.5 5
用Serice指定要修改的索引及对应值,没有指定的数据用Nan。
name year pop state con
one wang 2000 1.0 16.5 NaN
two wei 2010 1.1 16.5 -1.0
three RPA 2020 1.2 16.5 -1.1
four python 2030 1.3 16.5 NaN
five linux 2040 1.4 16.5 -1.2
six C 2050 1.5 16.5 NaN
---赋值给新列---
name year pop state con new_data
one wang 2000 1.0 16.5 NaN True
two wei 2010 1.1 16.5 -1.0 False
three RPA 2020 1.2 16.5 -1.1 False
four python 2030 1.3 16.5 NaN False
five linux 2040 1.4 16.5 -1.2 False
six C 2050 1.5 16.5 NaN False
Index(['name', 'year', 'pop', 'state', 'con', 'new_data'], dtype='object')
---DataFrame转置---
Nevada Ohio
2000 NaN 1.5
2001 1.1 1.7
2002 2.2 3.6
2000 2001 2002
Nevada NaN 1.1 2.2
Ohio 1.5 1.7 3.6
---指定索引顺序,使用切片初始化顺序---
Nevada Ohio
2001 1.1 1.7
2002 2.2 3.6
2003 NaN NaN
Nevada Ohio
2000 NaN 1.5
2001 1.1 1.7
---指定索引和列名---
state Nevada Ohio
year
2000 NaN 1.5
2001 1.1 1.7
2002 2.2 3.6
[[nan 1.5]
[1.1 1.7]
[2.2 3.6]]
[['wang' 2000 1.0 16.5 nan True]
['wei' 2010 1.1 16.5 -1.0 False]
['RPA' 2020 1.2 16.5 -1.1 False]
['python' 2030 1.3 16.5 nan False]
['linux' 2040 1.4 16.5 -1.2 False]
['C' 2050 1.5 16.5 nan False]]
Process finished with exit code 0