import pandas as pd





In [2]: import pandas as pd  In [3]: obj = pd.Series([4, 7, -5, 3])  In [4]: obj Out[4]:  0    4 1    7 2   -5 3    3 dtype: int64  In [5]: obj.values Out[5]: array([ 4,  7, -5,  3])  In [6]: obj.index Out[6]: Int64Index([0, 1, 2, 3], dtype='int64')


In [2]: obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])  In [3]: obj2 Out[3]:  d    4 b    7 a   -5 c    3 dtype: int64  In [4]: obj2.index Out[4]: Index(['d', 'b', 'a', 'c'], dtype='object')  In [10]: obj2['a'] Out[10]: -5  In [11]: obj2['d'] = 6  In [12]: obj2[['c', 'a', 'd']] Out[12]:  c    3 a   -5 d    6 dtype: int64



In [13]: obj2[obj2 > 0] Out[13]:  d    6 b    7 c    3 dtype: int64  In [14]: obj2 * 2 Out[14]:  d    12 b    14 a   -10 c     6 dtype: int64  In [15]: obj2 Out[15]:  d    6 b    7 a   -5 c    3 dtype: int64  In [17]: import numpy as np  In [18]: np.exp(obj2) Out[18]:  d     403.428793 b    1096.633158 a       0.006738 c      20.085537 dtype: float64  In [19]: 'b' in obj2 Out[19]: True  In [20]: 'e' in obj2 Out[20]: False


In [21]: sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}  In [22]: obj3 = pd.Series(sdata)  In [23]: obj3 Out[23]:  Ohio      35000 Oregon    16000 Texas     71000 Utah       5000 dtype: int64  In [24]: states = ['California', 'Ohio', 'Oregon', 'Texas']  In [25]: obj4 = pd.Series(sdata, index=states)  In [26]: obj4 Out[26]:  California        NaN Ohio          35000.0 Oregon        16000.0 Texas         71000.0 dtype: float64  In [27]: pd.isnull(obj4) Out[27]:  California     True Ohio          False Oregon        False Texas         False dtype: bool  In [28]: pd.notnull(obj4) Out[28]:  California    False Ohio           True Oregon         True Texas          True dtype: bool  In [29]: obj4.isnull() Out[29]:  California     True Ohio          False Oregon        False Texas         False dtype: bool  In [32]: obj4.notnull() Out[32]:  California    False Ohio           True Oregon         True Texas          True dtype: bool


In [33]: obj3 Out[33]:  Ohio      35000 Oregon    16000 Texas     71000 Utah       5000 dtype: int64  In [34]: obj4 Out[34]:  California        NaN Ohio          35000.0 Oregon        16000.0 Texas         71000.0 dtype: float64  In [35]: obj3 + obj4 Out[35]:  California         NaN Ohio           70000.0 Oregon         32000.0 Texas         142000.0 Utah               NaN dtype: float64  In [36]: obj4.name = 'population'  In [37]: obj4.index.name = 'state'  In [38]: obj4 Out[38]:  state California        NaN Ohio          35000.0 Oregon        16000.0 Texas         71000.0 Name: population, dtype: float64  In [40]: obj = pd.Series([4, 7, -5, 3])  In [41]: obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']  In [42]: obj Out[42]:  Bob      4 Steve    7 Jeff    -5 Ryan     3 dtype: int64



In [1]: import pandas as pd  In [2]: import numpy as np  In [3]:   In [3]: data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],    ...: 'year': [2000, 2001, 2002, 2001, 2002, 2003],    ...: 'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}  In [4]:   In [4]: frame = pd.DataFrame(data)  In [5]: frame Out[5]:     pop   state  year 0  1.5    Ohio  2000 1  1.7    Ohio  2001 2  3.6    Ohio  2002 3  2.4  Nevada  2001 4  2.9  Nevada  2002 5  3.2  Nevada  2003  In [6]: frame.head() Out[6]:     pop   state  year 0  1.5    Ohio  2000 1  1.7    Ohio  2001 2  3.6    Ohio  2002 3  2.4  Nevada  2001 4  2.9  Nevada  2002  In [7]:   In [7]: pd.DataFrame(data, columns=['year', 'state', 'pop']) Out[7]:     year   state  pop 0  2000    Ohio  1.5 1  2001    Ohio  1.7 2  2002    Ohio  3.6 3  2001  Nevada  2.4 4  2002  Nevada  2.9 5  2003  Nevada  3.2  In [8]: frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],    ...: index=['one', 'two', 'three', 'four', 'five', 'six'])  In [9]: frame2 Out[9]:         year   state  pop debt one    2000    Ohio  1.5  NaN two    2001    Ohio  1.7  NaN three  2002    Ohio  3.6  NaN four   2001  Nevada  2.4  NaN five   2002  Nevada  2.9  NaN six    2003  Nevada  3.2  NaN  In [10]: frame2['state'] Out[10]:  one        Ohio two        Ohio three      Ohio four     Nevada five     Nevada six      Nevada Name: state, dtype: object

可见还可以通过columns指定DataFrame的列序, index指定索引名。跟Series一样,如果传入的列在数据中找不到,就会产生NaN值。通过类似字典的方式或属性的方式,可以将DataFrame的列获取为Series,返回的Series拥有DataFrame相同的索引,且其name属性也已经被相应地设置好。

行也可以用loc属性通过标签(名称)的方式进行获取(按整数位置获取则使用iloc)。列可以通过赋值的方式进行修改。将列表或数组赋值给某个列时,其长度必须跟DataFrame的长度相匹配。如果赋值的是Series,就会精确匹配DataFrame的索引,所有的空位都将被填上缺失值(NaN)。

In [11]: frame2.loc['three'] Out[11]:  year     2002 state    Ohio pop       3.6 debt      NaN Name: three, dtype: object  In [12]: frame2['debt'] = 16.5  In [13]: frame2 Out[13]:         year   state  pop  debt one    2000    Ohio  1.5  16.5 two    2001    Ohio  1.7  16.5 three  2002    Ohio  3.6  16.5 four   2001  Nevada  2.4  16.5 five   2002  Nevada  2.9  16.5 six    2003  Nevada  3.2  16.5  In [14]: frame2['debt'] = np.arange(6.)  In [15]: frame2 Out[15]:         year   state  pop  debt one    2000    Ohio  1.5   0.0 two    2001    Ohio  1.7   1.0 three  2002    Ohio  3.6   2.0 four   2001  Nevada  2.4   3.0 five   2002  Nevada  2.9   4.0 six    2003  Nevada  3.2   5.0  In [16]: val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])  In [17]: frame2['debt'] = val  In [18]: frame2 Out[18]:         year   state  pop  debt one    2000    Ohio  1.5   NaN two    2001    Ohio  1.7  -1.2 three  2002    Ohio  3.6   NaN four   2001  Nevada  2.4  -1.5 five   2002  Nevada  2.9  -1.7 six    2003  Nevada  3.2   NaN


In [19]: frame2['eastern'] = frame2['state'] == 'Ohio'  In [20]: frame2 Out[20]:         year   state  pop  debt  eastern one    2000    Ohio  1.5   NaN     True two    2001    Ohio  1.7  -1.2     True three  2002    Ohio  3.6   NaN     True four   2001  Nevada  2.4  -1.5    False five   2002  Nevada  2.9  -1.7    False six    2003  Nevada  3.2   NaN    False  In [21]: del frame2['eastern']  In [22]: frame2.columns Out[22]: Index(['year', 'state', 'pop', 'debt'], dtype='object')

通过索引方式返回的列只是相应数据的视图而不是副本。因此,对返回的Series所做的任何就地修改全都会反映到源DataFrame上。通过Series的copy方法即可显式地复制列。


In [23]: pop = {'Nevada': {2001: 2.4, 2002: 2.9},    ....: 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}  In [24]: frame3 = pd.DataFrame(pop)  In [25]: frame3 Out[25]:        Nevada  Ohio 2000     NaN   1.5 2001     2.4   1.7 2002     2.9   3.6  In [26]: frame3.T Out[26]:          2000  2001  2002 Nevada   NaN   2.4   2.9 Ohio     1.5   1.7   3.6  In [27]: pd.DataFrame(pop, index=[2001, 2002, 2003]) Out[27]:        Nevada  Ohio 2001     2.4   1.7 2002     2.9   3.6 2003     NaN   NaN  In [28]: pdata = {'Ohio': frame3['Ohio'][:-1], 'Nevada': frame3['Nevada'][:2]}  In [29]: pdata Out[29]:  {'Ohio': 2000    1.5  2001    1.7  Name: Ohio, dtype: float64, 'Nevada': 2000    NaN  2001    2.4  Name: Nevada, dtype: float64}  In [30]: pd.DataFrame(pdata) Out[30]:        Nevada  Ohio 2000     NaN   1.5 2001     2.4   1.7  In [31]: frame3.index.name = 'year'; frame3.columns.name = 'state'  In [32]: frame3 Out[32]:  state  Nevada  Ohio year                2000      NaN   1.5 2001      2.4   1.7 2002      2.9   3.6  In [33]: frame3.values Out[33]:  array([[ nan,  1.5],        [ 2.4,  1.7],        [ 2.9,  3.6]])  In [34]: frame2.values Out[34]:  array([[2000, 'Ohio', 1.5, nan],        [2001, 'Ohio', 1.7, -1.2],        [2002, 'Ohio', 3.6, nan],        [2001, 'Nevada', 2.4, -1.5],        [2002, 'Nevada', 2.9, -1.7],        [2003, 'Nevada', 3.2, nan]], dtype=object)


DataFrame的constructor接受的类型为:2D ndarray、dict of arrays, lists, or tuples、NumPy structured/record array、dict of Series、dict of dicts、List of dicts or Series、List of lists or tuples、Another DataFrame、NumPy MaskedArray。

更多参考: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html



In [35]: obj = pd.Series(range(3), index=['a', 'b', 'c'])  In [36]: index = obj.index  In [37]: index Out[37]: Index(['a', 'b', 'c'], dtype='object')  In [38]: index[1:] Out[38]: Index(['b', 'c'], dtype='object')  In [39]: index[1] = 'd' --------------------------------------------------------------------------- TypeError                                 Traceback (most recent call last) <ipython-input-39-676fdeb26a68> in <module>() ----> 1 index[1] = 'd'  /usr/local/lib/python3.5/dist-packages/pandas/core/indexes/base.py in __setitem__(self, key, value)    1722     1723     def __setitem__(self, key, value): -> 1724         raise TypeError("Index does not support mutable operations")    1725     1726     def __getitem__(self, key):  TypeError: Index does not support mutable operations  In [40]: labels = pd.Index(np.arange(3))  In [41]: labels Out[41]: Int64Index([0, 1, 2], dtype='int64')  In [42]: obj2 = pd.Series([1.5, -2.5, 0], index=labels)  In [43]: obj2 Out[43]:  0    1.5 1   -2.5 2    0.0 dtype: float64  In [44]: obj2.index is labels Out[44]: True  In [45]: frame3 Out[45]:  state  Nevada  Ohio year                2000      NaN   1.5 2001      2.4   1.7 2002      2.9   3.6  In [46]: frame3.columns Out[46]: Index(['Nevada', 'Ohio'], dtype='object', name='state')  In [47]: 'Ohio' in frame3.columns Out[47]: True  In [48]: 2003 in frame3.index Out[48]: False  In [49]: dup_labels = pd.Index(['foo', 'foo', 'bar', 'bar'])  In [50]: dup_labels Out[50]: Index(['foo', 'foo', 'bar', 'bar'], dtype='object')



更多参考: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Index.html




In [51]: obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])  In [52]: obj Out[52]:  d    4.5 b    7.2 a   -5.3 c    3.6 dtype: float64  # 调用reindex将会根据新索引进行重排。如果某个索引值当前不存在,就为NaN  In [53]: obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])  In [54]: obj2 Out[54]:  a   -5.3 b    7.2 c    3.6 d    4.5 e    NaN dtype: float64  In [55]: obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])  In [56]: obj3 Out[56]:  0      blue 2    purple 4    yellow dtype: object  # 对于时间序列这样的有序数据,重新索引时可能需要做插值处理。method选项即可达到此目的,例如,使用ffill以实现前向值填充:  In [57]: obj3.reindex(range(6), method='ffill') Out[57]:  0      blue 1      blue 2    purple 3    purple 4    yellow 5    yellow dtype: object  # DataFrame中reindex可以调整行列 In [58]: frame = pd.DataFrame(np.arange(9).reshape((3, 3)),    ....: index=['a', 'c', 'd'],    ....: columns=['Ohio', 'Texas', 'California'])  In [59]: frame Out[59]:     Ohio  Texas  California a     0      1           2 c     3      4           5 d     6      7           8  In [60]: frame2 = frame.reindex(['a', 'b', 'c', 'd'])  In [61]: frame2 Out[61]:     Ohio  Texas  California a   0.0    1.0         2.0 b   NaN    NaN         NaN c   3.0    4.0         5.0 d   6.0    7.0         8.0  In [62]: states = ['Texas', 'Utah', 'California']  In [63]: frame.reindex(columns=states) Out[63]:     Texas  Utah  California a      1   NaN           2 c      4   NaN           5 d      7   NaN           8  In [69]:  frame2 = frame.reindex(['a', 'b', 'c', 'd'],columns=states)  In [70]: frame2 Out[70]:     Texas  Utah  California a    1.0   NaN         2.0 b    NaN   NaN         NaN c    4.0   NaN         5.0 d    7.0   NaN         8.0


更多参考: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.reindex.html



In [71]: obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])  In [72]: obj Out[72]:  a    0.0 b    1.0 c    2.0 d    3.0 e    4.0 dtype: float64  In [73]: new_obj = obj.drop('c')  In [74]: new_obj Out[74]:  a    0.0 b    1.0 d    3.0 e    4.0 dtype: float64  In [75]: obj Out[75]:  a    0.0 b    1.0 c    2.0 d    3.0 e    4.0 dtype: float64  In [76]: obj.drop(['d', 'c']) Out[76]:  a    0.0 b    1.0 e    4.0 dtype: float64  In [77]: obj Out[77]:  a    0.0 b    1.0 c    2.0 d    3.0 e    4.0 dtype: float64  In [78]: data = pd.DataFrame(np.arange(16).reshape((4, 4)),    ....: index=['Ohio', 'Colorado', 'Utah', 'New York'],    ....: columns=['one', 'two', 'three', 'four'])  In [79]: data Out[79]:            one  two  three  four Ohio        0    1      2     3 Colorado    4    5      6     7 Utah        8    9     10    11 New York   12   13     14    15  In [80]: data.drop(['Colorado', 'Ohio']) Out[80]:            one  two  three  four Utah        8    9     10    11 New York   12   13     14    15  In []: data.drop('two',1) Out[57]:            one  three  four Ohio        0      2     3 Colorado    4      6     7 Utah        8     10    11 New York   12     14    15  In []: data.drop('two', axis=1) Out[58]:            one  three  four Ohio        0      2     3 Colorado    4      6     7 Utah        8     10    11 New York   12     14    15  In []: data.drop(['two', 'four'], axis='columns') Out[59]:            one  three Ohio        0      2 Colorado    4      6 Utah        8     10 New York   12     14  In []: obj.drop('c', inplace=True)  In []: obj Out[61]:  d    4.5 b    7.2 a   -5.3 dtype: float64



obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])  obj Out[63]:  a    0.0 b    1.0 c    2.0 d    3.0 dtype: float64  obj['b'] Out[64]: 1.0  obj[1] Out[65]: 1.0  obj[2:4] Out[66]:  c    2.0 d    3.0 dtype: float64  obj[['b', 'a', 'd']] Out[67]:  b    1.0 a    0.0 d    3.0 dtype: float64  obj[[1, 3]] Out[68]:  b    1.0 d    3.0 dtype: float64  obj[obj < 2] Out[69]:  a    0.0 b    1.0 dtype: float64  obj['b':'c'] Out[70]:  b    1.0 c    2.0 dtype: float64  obj['b':'c'] = 5  obj Out[72]:  a    0.0 b    5.0 c    5.0 d    3.0 dtype: float64


data = pd.DataFrame(np.arange(16).reshape((4, 4)), index=['Ohio', 'Colorado', 'Utah', 'New York'], columns=['one', 'two', 'three', 'four'])  data Out[74]:            one  two  three  four Ohio        0    1      2     3 Colorado    4    5      6     7 Utah        8    9     10    11 New York   12   13     14    15  data['two'] Out[75]:  Ohio         1 Colorado     5 Utah         9 New York    13 Name: two, dtype: int32  data[['three', 'one']] Out[76]:            three  one Ohio          2    0 Colorado      6    4 Utah         10    8 New York     14   12  data[:2] Out[77]:            one  two  three  four Ohio        0    1      2     3 Colorado    4    5      6     7  data[data['three'] > 5] Out[78]:            one  two  three  four Colorado    4    5      6     7 Utah        8    9     10    11 New York   12   13     14    15  data < 5 Out[79]:              one    two  three   four Ohio       True   True   True   True Colorado   True  False  False  False Utah      False  False  False  False New York  False  False  False  False  data[data < 5] = 0  data Out[81]:            one  two  three  four Ohio        0    0      0     0 Colorado    0    5      6     7 Utah        8    9     10    11 New York   12   13     14    15
  • loc和iloc


data = pd.DataFrame(np.arange(16).reshape((4, 4)), index=['Ohio', 'Colorado', 'Utah', 'New York'], columns=['one', 'two', 'three', 'four'])  data Out[74]:            one  two  three  four Ohio        0    1      2     3 Colorado    4    5      6     7 Utah        8    9     10    11 New York   12   13     14    15  data['two'] Out[75]:  Ohio         1 Colorado     5 Utah         9 New York    13 Name: two, dtype: int32  data[['three', 'one']] Out[76]:            three  one Ohio          2    0 Colorado      6    4 Utah         10    8 New York     14   12  data[:2] Out[77]:            one  two  three  four Ohio        0    1      2     3 Colorado    4    5      6     7  data[data['three'] > 5] Out[78]:            one  two  three  four Colorado    4    5      6     7 Utah        8    9     10    11 New York   12   13     14    15  data < 5 Out[79]:              one    two  three   four Ohio       True   True   True   True Colorado   True  False  False  False Utah      False  False  False  False New York  False  False  False  False  data[data < 5] = 0  data Out[81]:            one  two  three  four Ohio        0    0      0     0 Colorado    0    5      6     7 Utah        8    9     10    11 New York   12   13     14    15  data.loc['Colorado', ['two', 'three']] Out[82]:  two      5 three    6 Name: Colorado, dtype: int32  data.iloc[2, [3, 0, 1]] Out[83]:  four    11 one      8 two      9 Name: Utah, dtype: int32  data.iloc[2] Out[84]:  one       8 two       9 three    10 four     11 Name: Utah, dtype: int32  data.iloc[[1, 2], [3, 0, 1]] Out[85]:            four  one  two Colorado     7    0    5 Utah        11    8    9  data.loc[:'Utah', 'two'] Out[86]:  Ohio        0 Colorado    5 Utah        9 Name: two, dtype: int32  data.iloc[:, :3][data.three > 5] Out[87]:            one  two  three Colorado    0    5      6 Utah        8    9     10 New York   12   13     14



df[val] | 选择列 df.loc[val] | 选择行 df.loc[:, val] | 选择列 df.loc[val1, val2] | 选择行列 df.iloc[where] | 选择行 df.iloc[:, where] | 选择列 df.iloc[where_i, where_j] | 选择行列 df.at[label_i, label_j] | 选择值 df.iat[i, j] | 选择值 reindex method | 通过label选择多行或列 get_value, set_value | 通过行和列的label选择单个值

整数索引(Integer Indexes)


ser = pd.Series(np.arange(3.))  ser[-1] Traceback (most recent call last):    File "<ipython-input-20-3cbe0b873a9e>", line 1, in <module>     ser[-1]    File "C:\Users\andrew\AppData\Local\conda\conda\envs\my_root\lib\site-packages\pandas\core\series.py", line 601, in __getitem__     result = self.index.get_value(self, key)    File "C:\Users\andrew\AppData\Local\conda\conda\envs\my_root\lib\site-packages\pandas\core\indexes\base.py", line 2477, in get_value     tz=getattr(series.dtype, 'tz', None))    File "pandas\_libs\index.pyx", line 98, in pandas._libs.index.IndexEngine.get_value    File "pandas\_libs\index.pyx", line 106, in pandas._libs.index.IndexEngine.get_value    File "pandas\_libs\index.pyx", line 154, in pandas._libs.index.IndexEngine.get_loc    File "pandas\_libs\hashtable_class_helper.pxi", line 759, in pandas._libs.hashtable.Int64HashTable.get_item    File "pandas\_libs\hashtable_class_helper.pxi", line 765, in pandas._libs.hashtable.Int64HashTable.get_item  KeyError: -1  ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])  ser2[-1] Out[22]: 2.0  ser[:1] Out[23]:  0    0.0 dtype: float64  ser.loc[:1] Out[24]:  0    0.0 1    1.0 dtype: float64  ser.iloc[:1] Out[25]:  0    0.0 dtype: float64



s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])  s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])  s1 Out[28]:  a    7.3 c   -2.5 d    3.4 e    1.5 dtype: float64  s2 Out[29]:  a   -2.1 c    3.6 e   -1.5 f    4.0 g    3.1 dtype: float64  s1 + s2 Out[30]:  a    5.2 c    1.1 d    NaN e    0.0 f    NaN g    NaN dtype: float64  df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'), index=['Ohio', 'Texas', 'Colorado'])  df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])  df1 Out[33]:              b    c    d Ohio      0.0  1.0  2.0 Texas     3.0  4.0  5.0 Colorado  6.0  7.0  8.0  df2 Out[34]:            b     d     e Utah    0.0   1.0   2.0 Ohio    3.0   4.0   5.0 Texas   6.0   7.0   8.0 Oregon  9.0  10.0  11.0  df1 + df2 Out[35]:              b   c     d   e Colorado  NaN NaN   NaN NaN Ohio      3.0 NaN   6.0 NaN Oregon    NaN NaN   NaN NaN Texas     9.0 NaN  12.0 NaN Utah      NaN NaN   NaN NaN  df1 = pd.DataFrame({'A': [1, 2]})  df2 = pd.DataFrame({'B': [3, 4]})  df1 Out[38]:     A 0  1 1  2  df2 Out[39]:     B 0  3 1  4  df1 - df2 Out[40]:      A   B 0 NaN NaN 1 NaN NaN


df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd'))  df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))  df1 Out[43]:       a    b     c     d 0  0.0  1.0   2.0   3.0 1  4.0  5.0   6.0   7.0 2  8.0  9.0  10.0  11.0  df2 Out[44]:        a     b     c     d     e 0   0.0   1.0   2.0   3.0   4.0 1   5.0   6.0   7.0   8.0   9.0 2  10.0  11.0  12.0  13.0  14.0 3  15.0  16.0  17.0  18.0  19.0  df2.loc[1, 'b'] = np.nan  df2 Out[46]:        a     b     c     d     e 0   0.0   1.0   2.0   3.0   4.0 1   5.0   NaN   7.0   8.0   9.0 2  10.0  11.0  12.0  13.0  14.0 3  15.0  16.0  17.0  18.0  19.0  df1 + df2 Out[47]:        a     b     c     d   e 0   0.0   2.0   4.0   6.0 NaN 1   9.0   NaN  13.0  15.0 NaN 2  18.0  20.0  22.0  24.0 NaN 3   NaN   NaN   NaN   NaN NaN  df1.add(df2, fill_value=0) Out[48]:        a     b     c     d     e 0   0.0   2.0   4.0   6.0   4.0 1   9.0   5.0  13.0  15.0   9.0 2  18.0  20.0  22.0  24.0  14.0 3  15.0  16.0  17.0  18.0  19.0  1 / df1 Out[49]:            a         b         c         d 0       inf  1.000000  0.500000  0.333333 1  0.250000  0.200000  0.166667  0.142857 2  0.125000  0.111111  0.100000  0.090909  df1.rdiv(1) Out[50]:            a         b         c         d 0       inf  1.000000  0.500000  0.333333 1  0.250000  0.200000  0.166667  0.142857 2  0.125000  0.111111  0.100000  0.090909   df1.reindex(columns=df2.columns, fill_value=0) Out[53]:       a    b     c     d  e 0  0.0  1.0   2.0   3.0  0 1  4.0  5.0   6.0   7.0  0 2  8.0  9.0  10.0  11.0  0

add, radd | for addition (+) sub, rsub | for subtraction (-) div, rdiv | for division (/) floordiv, rfloordiv | for floor division (//) mul, rmul | for multiplication (*) pow, rpow | for exponentiation (**)

  • DataFrame和Series间的操作

默认情况下,DataFrame和Series之间的算术运算会将Series的索引匹配到DataFrame的列,然后沿着行逐行向下广播;如果希望匹配行索引并在列上广播,则必须使用算术方法(如sub)并传入 axis='index'(或 axis=0)。

arr = np.arange(12.).reshape((3, 4))  arr Out[55]:  array([[  0.,   1.,   2.,   3.],        [  4.,   5.,   6.,   7.],        [  8.,   9.,  10.,  11.]])  arr[0] Out[56]: array([ 0.,  1.,  2.,  3.])  arr - arr[0] Out[57]:  array([[ 0.,  0.,  0.,  0.],        [ 4.,  4.,  4.,  4.],        [ 8.,  8.,  8.,  8.]])  arr Out[58]:  array([[  0.,   1.,   2.,   3.],        [  4.,   5.,   6.,   7.],        [  8.,   9.,  10.,  11.]])  frame = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),index=['Utah', 'Ohio', 'Texas', 'Oregon'])  series = frame.iloc[0]  frame Out[61]:            b     d     e Utah    0.0   1.0   2.0 Ohio    3.0   4.0   5.0 Texas   6.0   7.0   8.0 Oregon  9.0  10.0  11.0  series Out[62]:  b    0.0 d    1.0 e    2.0 Name: Utah, dtype: float64  frame - series Out[63]:            b    d    e Utah    0.0  0.0  0.0 Ohio    3.0  3.0  3.0 Texas   6.0  6.0  6.0 Oregon  9.0  9.0  9.0  series2 = pd.Series(range(3), index=['b', 'e', 'f'])  series2 Out[65]:  b    0 e    1 f    2 dtype: int32  frame + series2 Out[66]:            b   d     e   f Utah    0.0 NaN   3.0 NaN Ohio    3.0 NaN   6.0 NaN Texas   6.0 NaN   9.0 NaN Oregon  9.0 NaN  12.0 NaN   series3 = frame['d']  frame Out[69]:            b     d     e Utah    0.0   1.0   2.0 Ohio    3.0   4.0   5.0 Texas   6.0   7.0   8.0 Oregon  9.0  10.0  11.0  series3 Out[70]:  Utah       1.0 Ohio       4.0 Texas      7.0 Oregon    10.0 Name: d, dtype: float64  frame.sub(series3, axis='index') Out[71]:            b    d    e Utah   -1.0  0.0  1.0 Ohio   -1.0  0.0  1.0 Texas  -1.0  0.0  1.0 Oregon -1.0  0.0  1.0







arr = np.arange(12.).reshape((3, 4))  arr Out[73]:  array([[  0.,   1.,   2.,   3.],        [  4.,   5.,   6.,   7.],        [  8.,   9.,  10.,  11.]])  arr[0] Out[74]: array([ 0.,  1.,  2.,  3.])  arr - arr[0] Out[75]:  array([[ 0.,  0.,  0.,  0.],        [ 4.,  4.,  4.,  4.],        [ 8.,  8.,  8.,  8.]])      frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])  frame Out[77]:                 b         d         e Utah    0.255395  1.983985  0.936326 Ohio    0.319394  2.231544 -0.051256 Texas  -0.041388 -0.026032 -0.446722 Oregon  1.099475 -1.432638 -0.919189  np.abs(frame) Out[78]:                 b         d         e Utah    0.255395  1.983985  0.936326 Ohio    0.319394  2.231544  0.051256 Texas   0.041388  0.026032  0.446722 Oregon  1.099475  1.432638  0.919189  f = lambda x: x.max() - x.min()  frame.apply(f) Out[80]:  b    1.140863 d    3.664181 e    1.855515 dtype: float64  frame.apply(f, axis='columns') Out[81]:  Utah      1.728590 Ohio      2.282800 Texas     0.420690 Oregon    2.532113 dtype: float64  def f(x):     return pd.Series([x.min(), x.max()], index=['min', 'max'])   frame.apply(f) Out[83]:              b         d         e min -0.041388 -1.432638 -0.919189 max  1.099475  2.231544  0.936326  format = lambda x: '%.2f' % x  frame.applymap(format) Out[85]:              b      d      e Utah     0.26   1.98   0.94 Ohio     0.32   2.23  -0.05 Texas   -0.04  -0.03  -0.45 Oregon   1.10  -1.43  -0.92  frame['e'].map(format) Out[86]:  Utah       0.94 Ohio      -0.05 Texas     -0.45 Oregon    -0.92 Name: e, dtype: object




obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])  obj.sort_index() Out[88]:  a    1 b    2 c    3 d    0 dtype: int32  frame = pd.DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'],columns=['d', 'a', 'b', 'c'])  frame Out[90]:         d  a  b  c three  0  1  2  3 one    4  5  6  7  frame.sort_index() Out[91]:         d  a  b  c one    4  5  6  7 three  0  1  2  3  frame.sort_index(axis='columns') Out[94]:         a  b  c  d three  1  2  3  0 one    5  6  7  4


frame.sort_index(axis='columns', ascending=False) Out[95]:         d  c  b  a three  0  3  2  1 one    4  7  6  5 obj = pd.Series([4, 7, -3, 2])  obj.sort_values() Out[97]:  2   -3 3    2 0    4 1    7 dtype: int64  obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])  obj.sort_values() Out[99]:  4   -3.0 5    2.0 0    4.0 2    7.0 1    NaN 3    NaN dtype: float64  obj.sort_values(ascending=False) Out[100]:  2    7.0 0    4.0 5    2.0 4   -3.0 1    NaN 3    NaN dtype: float64


frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})  frame Out[102]:     a  b 0  0  4 1  1  7 2  0 -3 3  1  2  frame.sort_values(by='b') Out[103]:     a  b 2  0 -3 3  1  2 0  0  4 1  1  7  frame.sort_values(by=['a', 'b']) Out[104]:     a  b 2  0 -3 0  0  4 3  1  2 1  1  7




obj = pd.Series([7, -5, 7, 4, 2, 0, 4])  obj.rank() Out[106]:  0    6.5 1    1.0 2    6.5 3    4.5 4    3.0 5    2.0 6    4.5 dtype: float64  obj.rank(method='first') Out[107]:  0    6.0 1    1.0 2    7.0 3    4.0 4    3.0 5    2.0 6    5.0 dtype: float64  obj.rank(ascending=False, method='max') Out[108]:  0    2.0 1    7.0 2    2.0 3    4.0 4    5.0 5    6.0 6    4.0 dtype: float64  frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1], 'c': [-2, 5, 8, -2.5]})  frame Out[110]:     a    b    c 0  0  4.3 -2.0 1  1  7.0  5.0 2  0 -3.0  8.0 3  1  2.0 -2.5  frame.rank(axis='columns') Out[111]:       a    b    c 0  2.0  3.0  1.0 1  1.0  3.0  2.0 2  2.0  1.0  3.0 3  2.0  3.0  1.0

Method | Description 'average' | Default: assign the average rank to each entry in the equal group 'min' | Use the minimum rank for the whole group 'max' | Use the maximum rank for the whole group 'first' | Assign ranks in the order the values appear in the data 'dense' | Like method='min' , but ranks always increase by 1 in between groups rather than the number of equal elements in a group



import pandas as pd  obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])  obj Out[3]:  a    0 a    1 b    2 b    3 c    4 dtype: int32  obj.index.is_unique Out[4]: False  obj['a'] Out[5]:  a    0 a    1 dtype: int32  obj['c'] Out[6]: 4  import numpy as np  df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])  df Out[10]:            0         1         2 a  0.835470  0.465657 -0.068212 a -1.067020  1.148283  1.722324 b  0.057184 -0.441111 -0.388286 b -0.363911 -0.599963  0.126594  df.loc['b'] Out[11]:            0         1         2 b  0.057184 -0.441111 -0.388286 b -0.363911 -0.599963  0.126594



df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],[np.nan, np.nan], [0.75, -1.3]],  index=['a', 'b', 'c', 'd'],columns=['one', 'two'])  df Out[14]:      one  two a  1.40  NaN b  7.10 -4.5 c   NaN  NaN d  0.75 -1.3  df.sum() Out[15]:  one    9.25 two   -5.80 dtype: float64  df.sum(axis='columns') Out[16]:  a    1.40 b    2.60 c    0.00 d   -0.55 dtype: float64  df.mean(axis='columns', skipna=False) Out[17]:  a      NaN b    1.300 c      NaN d   -0.275 dtype: float64  df.mean(axis='columns') Out[18]:  a    1.400 b    1.300 c      NaN d   -0.275 dtype: float64

Method | Description axis | Axis to reduce over; 0 for DataFrame’s rows and 1 for columns skipna | Exclude missing values; True by default level | Reduce grouped by level if the axis is hierarchically indexed (MultiIndex)


df Out[19]:      one  two a  1.40  NaN b  7.10 -4.5 c   NaN  NaN d  0.75 -1.3  df.idxmax() Out[20]:  one    b two    d dtype: object  df.cumsum() Out[21]:      one  two a  1.40  NaN b  8.50 -4.5 c   NaN  NaN d  9.25 -5.8  df.describe() Out[22]:              one       two count  3.000000  2.000000 mean   3.083333 -2.900000 std    3.493685  2.262742 min    0.750000 -4.500000 25%    1.075000 -3.700000 50%    1.400000 -2.900000 75%    4.250000 -2.100000 max    7.100000 -1.300000  obj = pd.Series(['a', 'a', 'b', 'c'] * 4)  obj.describe() Out[24]:  count     16 unique     3 top        a freq       8 dtype: object

Method | Description count | Number of non-NA values describe | Compute set of summary statistics for Series or each DataFrame column min, max | Compute minimum and maximum values argmin, argmax | Compute index locations (integers) at which minimum or maximum value obtained, respectively idxmin, idxmax | Compute index labels at which minimum or maximum value obtained, respectively quantile | Compute sample quantile ranging from 0 to 1 sum | Sum of values mean | Mean of values median | Arithmetic median (50% quantile) of values mad | Mean absolute deviation from mean value prod | Product of all values var | Sample variance of values std | Sample standard deviation of values skew | Sample skewness (third moment) of values kurt | Sample kurtosis (fourth moment) of values cumsum | Cumulative sum of values cummin, cummax | Cumulative minimum or maximum of values, respectively cumprod | Cumulative product of values diff | Compute first arithmetic difference (useful for time series) pct_change | Compute percent changes


一些汇总统计,如相关系数和协方差,是通过成对的参数计算出来的。让我们考虑一些来自Yahoo! Finance的股票价格和成交量的DataFrame,它们使用附加的pandas-datareader包获取,




import pandas as pd  obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])  uniques = obj.unique()  uniques Out[9]: array(['c', 'a', 'd', 'b'], dtype=object)  obj.value_counts() Out[10]:  a    3 c    3 b    2 d    1 dtype: int64  pd.value_counts(obj.values, sort=False) Out[11]:  c    3 d    1 b    2 a    3 dtype: int64  obj Out[12]:  0    c 1    a 2    d 3    a 4    a 5    b 6    b 7    c 8    c dtype: object  mask = obj.isin(['b', 'c'])  mask Out[14]:  0     True 1    False 2    False 3    False 4    False 5     True 6     True 7     True 8     True dtype: bool  obj[mask] Out[15]:  0    c 5    b 6    b 7    c 8    c dtype: object  to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a'])  unique_vals = pd.Series(['c', 'b', 'a'])  pd.Index(unique_vals).get_indexer(to_match) Out[18]: array([0, 2, 1, 1, 0, 2], dtype=int64)  data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4], 'Qu2': [2, 3, 1, 2, 3], 'Qu3': [1, 5, 2, 4, 4]})  data Out[20]:     Qu1  Qu2  Qu3 0    1    2    1 1    3    3    5 2    4    1    2 3    3    2    4 4    4    3    4  result = data.apply(pd.value_counts).fillna(0)  result Out[22]:     Qu1  Qu2  Qu3 1  1.0  1.0  1.0 2  0.0  2.0  1.0 3  2.0  2.0  0.0 4  2.0  0.0  2.0 5  0.0  0.0  1.0

Method | Description isin | Compute boolean array indicating whether each Series value is contained in the passed sequence of values match | Compute integer indices for each value in an array into another array of distinct values; helpful for data alignment and join-type operations unique | Compute array of unique values in a Series, returned in the order observed value_counts | Return a Series containing unique values as its index and frequencies as its values, ordered count in descending order

