-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlog_bin_CN_2016.py
250 lines (229 loc) · 9.44 KB
/
log_bin_CN_2016.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
'''
################################################################################
# James Clough 2015
# 2016 Complexity & Networks course
#
# log_bin.py v1.3
# 14/01/2016
#
# Usage instructions for log binning:
#
# to import the log_bin functions:
# from log_bin import *
#
# to use the log_bin function:
# arguments:
# data - list - the numbers we want to get a probability distribution of
# bin_start - float - the left edge of the first bin
# first_bin_width - float - the width of the first bin
# a - float - growth factor - each bin is a times larger than the bin to its left
# datatype - string - either 'float' or 'integer' so we know how to normalise bin widths
#
# returns:
# centres - list - the centre of each bin (in log space - ie. geometric mean)
# counts - list - the probability of data being in each
#
#
# Examples - available below
#
# - test_float() - Generates some power-law distributed real numbers and log-
# bins them. The blue points are the original numbers - the red
# line is the log-binning - we can see how useful it is because
# we get a few more orders of mangnitude of data
#
# - test_int() - As above, but with integers
#
# - test_too_low_a() - Shows a plot where the growth factor, a, is too low
# and there is a kink in the line due to there being too
# few points in the bin
#
# Notes:
#
# - The data will be binned with the width of bins growing exponentially so that
# even when there is less than one expected point per value we can still
# estimate the probability distribution.
#
# - It matters whether your data are real numbers, or integers as this will change
# how we should normalise the height of each bin - use the 'datatype' argument
# to tell the function whether you are using integers or real numbers
# This is not asking whether the data you have provided are Python integers, or floats
# but rather whether you want to treat the data as integers or floats for the
# purpose of the log binning.
#
# - Make sure to use a sensible value for 'a', which is how much larger each bin
# is than the bin to its left. If a is too small then you will have too many
# bins and too few points per bin. If it is too large then there won't be
# many bins.
#
# - If you want to have negative values on the x-axis then something will break
# when finding the centre of the bin that contains 0 - if you need to do this
# you will need a workaround as log-binning negative data is not well defined.
#
# - The solution implemented here is not the most efficient one possible.
# possible. Feel free to tinker with this code to make it better.
#
# - There may be some bugs here - please email me if you find one.
################################################################################
'''
################################################################################
import numpy as np
import random
################################################################################
# BINNING FUNCTIONS
################################################################################
# frequency
#
# arguments:
# data - list of values
#
# returns:
# values - list of values
# frequency - list of frequencies of values
################################################################################
def frequency(data):
from collections import Counter
c = Counter(data)
return np.array([float(x) for x in c.keys()]), np.array([float(x) for x in c.values()])
################################################################################
# lin_bin
# linear binning of data
#
# arguments:
# data - list - the numbers we want to get the probability distribution of
# num_bins - integer - the number of bins to use
#
# returns:
# bin_centres - list - the centres of each bin (arithmetic mean)
# normed_counts - list - the probability of data being in each bin
################################################################################
def lin_bin(data, num_bins):
data = np.array(data)
num_datapoints = float(len(data))
counts, bin_edges = np.histogram(data, num_bins)
bin_centres = (bin_edges[1:] + bin_edges[:-1])/2
normed_counts = counts/num_datapoints
return bin_centres, normed_counts
################################################################################
# log_bin
# log bin data of either integers, or real numbers
#
# arguments:
# data - list - the numbers we want to get a probability distribution of
# bin_start - float - the left edge of the first bin
# first_bin_width - float - the width of the first bin
# a - float - growth factor - each bin is a times larger than the bin to its left
# datatype - string - either 'float' or 'integer' so we know how to normalise bin widths
# drop_zeros - boolean - if True, zeros are not counted in the normalisation of the probability distribution
#
# returns:
# centres - list - the centre of each bin (in log space - ie. geometric mean)
# counts - list - the probability of data being in each bin
################################################################################
def log_bin(data, bin_start=1., first_bin_width=1., a=2., datatype='float', drop_zeros=True, debug_mode=False):
# check datatype is valid
valid_datatypes = ('float', 'integer')
if datatype not in valid_datatypes:
print 'datatype must be either "float" or "integer"'
exit(0)
# ensure data is numpy array of floats
if drop_zeros:
data = np.array([float(x) for x in data if x!=0])
else:
data = np.array([float(x) for x in data])
num_datapoints = len(data)
min_x, max_x = min(data), max(data)
# create array of the edges of the bins beginning with the left edge of the
# leftmost bin, and ending with the right edge of the rightmost
bin_width = first_bin_width
bins = [bin_start]
new_edge = bin_start
while new_edge <= max_x:
last_edge = new_edge
new_edge = last_edge + bin_width
bins.append(new_edge)
bin_width *= a
# find how many datapoints are in each bin
# counts[i] is how many points are there in the bin whose left edge is bins[i]
indices = np.digitize(data, bins[1:])
counts = [0. for x in bins[1:]]
for i in indices:
counts[i] += 1./num_datapoints
# normalise number of datapoints by the width of the bin
# how we do this depends on whether we are binning integers or real numbers
# by width - we mean the amount of possible values that can fall in that bin
# we want to give a 'centre' of the bin to plot its height from
# if the data is approximately power law distributed then the most
# representative would be the geometric mean, since in it is in the middle
# in log space
bin_indices = range(len(bins)-1)
if datatype == 'float':
widths = [bins[i+1] - bins[i] for i in bin_indices]
centres = [np.sqrt(bins[i+1] * bins[i]) for i in bin_indices]
else:
widths = [np.ceil(bins[i+1]) - np.ceil(bins[i]) for i in bin_indices]
integers_in_each_bin = [range(int(np.ceil(bins[i])), int(np.ceil(bins[i+1]))) for i in bin_indices]
centres = [geometric_mean(x) for x in integers_in_each_bin]
widths = np.array(widths)
counts /= widths
# print out some values to help with debugging
if debug_mode:
print 'DATA - %s' % data[:10]
print 'BINS - %s' % bins[:10]
print 'INDICES - %s' % indices[:10]
print 'COUNTS - %s' % counts[:10]
print 'WIDTHS - %s' % widths[:10]
try:
print 'INTEGERS IN EACH BIN - %s' % integers_in_each_bin[:10]
except:
pass
print 'CENTRES - %s' % centres[:10]
return centres, counts
# returns the geometric mean of a list
def geometric_mean(x):
s = len(x)
y = [np.log(z) for z in x]
z = sum(y)/s
return np.exp(z)
################################################################################
# EXAMPLES
################################################################################
# generate some test data that is power law distributed
def generate_power_law_data(gamma, N):
x = np.random.random(N)
y = 1./(1-x)
y = y**(1./gamma)
y.sort()
return y
# log bin some data
def test_float(N=100000., a=1.5):
import matplotlib.pyplot as plt
x = generate_power_law_data(1, N)
vals, counts = lin_bin(x, int(max(x)))
b, c = log_bin(x, 1., 1., a, debug_mode=True)
plt.loglog(vals, counts, 'bx')
plt.loglog(b, c, 'r-')
plt.show()
# now with integers
def test_int(N=100000., a=1.5):
import matplotlib.pyplot as plt
x = generate_power_law_data(1, N)
x = np.array([int(z) for z in x])
vals, counts = frequency(x)
b, c = log_bin(x, 1., 1.5, a, 'integer', debug_mode=True)
plt.loglog(vals, counts/N, 'bx')
plt.loglog(b, c, 'r-')
plt.show()
# log bin some data - the very low growth factor gives a kink in the line
# this happens when there are too few points in each bin
def test_too_low_a():
test_float(a=1.01)
def test_normalisation(N=10000.):
import matplotlib.pyplot as plt
x = generate_power_law_data(1, N)
x = np.concatenate([x, [0. for z in range(int(N))]])
vals, counts = lin_bin(x, int(max(x)))
b, c = log_bin(x, 1.0, 1.5, 1.5, 'float', drop_zeros=True)
plt.loglog(vals, counts, 'bx')
plt.loglog(b, c, 'r-')
plt.show()