数据结构 DataFrame

#!/usr/bin/env Python3
__author__ = '未昔/angelfate'
__date__ = '2019/8/6 10:52'
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np

"""
数据结构 DataFrame
创建,读写
"""
# Walk-through of basic pandas DataFrame usage: construction from dicts,
# column/row selection, column assignment, transposition, and naming axes.
# Every step prints its result so the script doubles as a reference output.
print('用字典生成DataFrame,key为列名(默认key是无序的)。')
# Column-oriented input: each key becomes a column, each list its values.
data = {
    'name': ['wang', 'wei', 'RPA', 'python', 'linux', 'C'],
    'year': [2000, 2010, 2020, 2030, 2040, 2050],
    'pop': [1.0, 1.1, 1.2, 1.3, 1.4, 1.5],
}
print('\n')

print(pd.DataFrame(data))  # build a DataFrame straight from the dict
print('----指定索引顺序----')
# Passing `columns` fixes the column order explicitly.
print(pd.DataFrame(data, columns=['name', 'year', 'pop']))

print('----指定索引,不存在的列,默认使用数据NaN----')
# Columns requested here but absent from `data` ('state', 'con') are
# created and filled with NaN; `index` supplies the row labels.
data2 = pd.DataFrame(
    data,
    columns=['name', 'year', 'pop', 'state', 'con'],
    index=['one', 'two', 'three', 'four', 'five', 'six'],
)
print(data2)
print('----查询指定 列索引 数据---')
print(data2['name'])  # select a column by key
print(data2.year)  # select a column by attribute access
print('----查询指定 行索引 数据----')
# Label-based row lookup. The original used DataFrame.ix, which was
# deprecated in pandas 0.20 and removed in 1.0; .loc is the replacement.
print(data2.loc['two'])
print('----修改列值----')
data2['state'] = 16.5  # a scalar is broadcast to the whole column
print(data2)
# An array of matching length replaces the column element by element.
# Bracket assignment is used instead of attribute assignment because it
# is unambiguous and also works for columns that do not exist yet.
data2['con'] = np.arange(6)
print(data2)
print('\n')

print('用Serice指定要修改的索引及对应值,没有指定的数据用Nan。')
# Assigning a Series aligns on the index: rows whose label is missing
# from `val` receive NaN.
val = pd.Series([-1.0, -1.1, -1.2], index=['two', 'three', 'five'])
data2['con'] = val
print(data2)
print('\n')

print('---赋值给新列---')
# Assigning to a key that does not exist creates a new (boolean) column.
data2['new_data'] = (data2['name'] == 'wang')
print(data2)
print(data2.columns)  # show the column labels
print('\n')

print('---DataFrame转置---')
# Dict of dicts: outer keys become columns, inner keys become row labels.
values = {
    'Nevada': {2001: 1.1, 2002: 2.2},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
data3 = pd.DataFrame(values)
print(data3)
print(data3.T)  # swap rows and columns
print('\n')

print('---指定索引顺序,使用切片初始化顺序---')
# An explicit index selects/reorders rows; unknown labels (2003) give NaN.
print(pd.DataFrame(values, index=[2001, 2002, 2003]))
# Build a frame from Series slices. The slices are positional, so .iloc
# states that intent explicitly on these integer-labelled Series.
data4 = {
    'Ohio': data3['Ohio'].iloc[:-1],
    'Nevada': data3['Nevada'].iloc[:2],
}
print(pd.DataFrame(data4))

print('---指定索引和列名---')
# Name the row axis and the column axis; the names appear in the printout.
data3.index.name = 'year'
data3.columns.name = 'state'
print(data3)
print(data3.values)  # underlying data only, as a NumPy array
print(data2.values)

结果

用字典生成DataFrame,key为列名(默认key是无序的)。


     name  pop  year
0    wang  1.0  2000
1     wei  1.1  2010
2     RPA  1.2  2020
3  python  1.3  2030
4   linux  1.4  2040
5       C  1.5  2050
----指定索引顺序----
     name  year  pop
0    wang  2000  1.0
1     wei  2010  1.1
2     RPA  2020  1.2
3  python  2030  1.3
4   linux  2040  1.4
5       C  2050  1.5
----指定索引,不存在的列,默认使用数据NaN----
         name  year  pop state  con
one      wang  2000  1.0   NaN  NaN
two       wei  2010  1.1   NaN  NaN
three     RPA  2020  1.2   NaN  NaN
four   python  2030  1.3   NaN  NaN
five    linux  2040  1.4   NaN  NaN
six         C  2050  1.5   NaN  NaN
----查询指定 列索引 数据---
one        wang
two         wei
three       RPA
four     python
five      linux
six           C
Name: name, dtype: object
one      2000
two      2010
three    2020
four     2030
five     2040
six      2050
Name: year, dtype: int64
----查询指定 行索引 数据----
name      wei
year     2010
pop       1.1
state     NaN
con       NaN
Name: two, dtype: object
----修改列值----
         name  year  pop  state  con
one      wang  2000  1.0   16.5  NaN
two       wei  2010  1.1   16.5  NaN
three     RPA  2020  1.2   16.5  NaN
four   python  2030  1.3   16.5  NaN
five    linux  2040  1.4   16.5  NaN
six         C  2050  1.5   16.5  NaN
         name  year  pop  state  con
one      wang  2000  1.0   16.5    0
two       wei  2010  1.1   16.5    1
three     RPA  2020  1.2   16.5    2
four   python  2030  1.3   16.5    3
five    linux  2040  1.4   16.5    4
six         C  2050  1.5   16.5    5


用Serice指定要修改的索引及对应值,没有指定的数据用Nan。
         name  year  pop  state  con
one      wang  2000  1.0   16.5  NaN
two       wei  2010  1.1   16.5 -1.0
three     RPA  2020  1.2   16.5 -1.1
four   python  2030  1.3   16.5  NaN
five    linux  2040  1.4   16.5 -1.2
six         C  2050  1.5   16.5  NaN


---赋值给新列---
         name  year  pop  state  con  new_data
one      wang  2000  1.0   16.5  NaN      True
two       wei  2010  1.1   16.5 -1.0     False
three     RPA  2020  1.2   16.5 -1.1     False
four   python  2030  1.3   16.5  NaN     False
five    linux  2040  1.4   16.5 -1.2     False
six         C  2050  1.5   16.5  NaN     False
Index(['name', 'year', 'pop', 'state', 'con', 'new_data'], dtype='object')


---DataFrame转置---
      Nevada  Ohio
2000     NaN   1.5
2001     1.1   1.7
2002     2.2   3.6
        2000  2001  2002
Nevada   NaN   1.1   2.2
Ohio     1.5   1.7   3.6


---指定索引顺序,使用切片初始化顺序---
      Nevada  Ohio
2001     1.1   1.7
2002     2.2   3.6
2003     NaN   NaN
      Nevada  Ohio
2000     NaN   1.5
2001     1.1   1.7
---指定索引和列名---
state  Nevada  Ohio
year               
2000      NaN   1.5
2001      1.1   1.7
2002      2.2   3.6
[[nan 1.5]
 [1.1 1.7]
 [2.2 3.6]]
[['wang' 2000 1.0 16.5 nan True]
 ['wei' 2010 1.1 16.5 -1.0 False]
 ['RPA' 2020 1.2 16.5 -1.1 False]
 ['python' 2030 1.3 16.5 nan False]
 ['linux' 2040 1.4 16.5 -1.2 False]
 ['C' 2050 1.5 16.5 nan False]]

Process finished with exit code 0