# coding: utf-8
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
# Sample ages to be grouped into brackets.
ages = [20, 22, 25, 27, 21, 23, 37, 61, 78]

# Bin edges. Four edges define three right-closed intervals:
# (18, 25], (25, 35] and (35, 60].
groups = [18, 25, 35, 60]

# Assign every age to its bracket; ages outside all brackets become NaN.
cats = pd.cut(ages, groups)
print(cats)
# Output:
# [(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], (18, 25], (35, 60], NaN, NaN]
# Categories (3, interval[int64]): [(18, 25] < (25, 35] < (35, 60]]

# Integer code of the bracket each age landed in (-1 marks "no bracket").
# `codes` replaces the old `labels` attribute, which newer pandas releases
# no longer provide.
print(cats.codes)
# Output:
# [ 0  0  0  1  0  0  2 -1 -1]

# Count how many ages fall into each bracket.
# `Series.value_counts` is used instead of the deprecated top-level
# `pd.value_counts`.
print(pd.Series(cats).value_counts())
# Sample output (the order of tied rows may vary):
# (18, 25]    5
# (35, 60]    1
# (25, 35]    1
# dtype: int64

# NOTE: supplying four labels for these four edges fails, because four edges
# define only three bins:
#     group_names = ['Youth', 'YoungAdult', 'MiddleAge', 'Senior']
#     pd.cut(ages, groups, labels=group_names)
# raises:
#     ValueError: Bin labels must be one fewer than the number of bin edges
# Array of 20 random floats.
data = np.random.rand(20)
print(data)
# Sample output (values differ on every run):
# [0.10734531 0.25739656 0.57500249 0.26341356 0.06755529 0.31844072
#  0.40825376 0.63144798 0.34930756 0.28772671 0.82629089 0.4465668
#  0.4369444  0.01694405 0.98986687 0.4619442  0.58291355 0.82963555
#  0.75812264 0.75970419]

# Passing an integer instead of explicit edges splits the observed value
# range into that many equal-width bins (4 here); `precision=2` limits the
# number of digits used for the bin edges.
print(pd.cut(data, 4, precision=2))
# Sample output (taken from a different run than the array above):
# [(0.77, 0.99], (0.54, 0.77], (0.77, 0.99], (0.32, 0.54], (0.32, 0.54], ...,
#  (0.77, 0.99], (0.77, 0.99], (0.094, 0.32], (0.77, 0.99], (0.77, 0.99]]
# Length: 20
# Categories (4, interval[float64]): [(0.094, 0.32] < (0.32, 0.54] < (0.54, 0.77] < (0.77, 0.99]]
# 1000 random samples; note that this rebinds `data` and `cats`.
data = np.random.randn(1000)

# Quantile-based binning: 4 bins chosen from the sample quantiles, so each
# bin receives the same number of observations.
cats = pd.qcut(data, 4)
print(cats)
# Sample output (values differ on every run):
# [(-3.936, -0.682], (-0.682, -0.0449], (-0.0449, 0.637], (-0.682, -0.0449],
#  (-0.682, -0.0449], ..., (0.637, 3.161], (-0.682, -0.0449],
#  (-0.0449, 0.637], (-0.0449, 0.637], (-0.682, -0.0449]]
# Length: 1000
# Categories (4, interval[float64]): [(-3.936, -0.682] < (-0.682, -0.0449] < (-0.0449, 0.637] < (0.637, 3.161]]

# Count the observations per quantile bin.
# `Series.value_counts` is used instead of the deprecated top-level
# `pd.value_counts`.
print(pd.Series(cats).value_counts())
# Sample output (taken from a different run than the listing above):
# (0.635, 3.356]       250
# (-0.0177, 0.635]     250
# (-0.697, -0.0177]    250
# (-2.877, -0.697]     250
# dtype: int64