from scipy import stats import matplotlib.pyplot as plt # First some basic examples: # Create two evenly spaced bins in the range of the given sample, and sum the # corresponding values in each of those bins: values = [1.0, 1.0, 2.0, 1.5, 3.0] stats.binned_statistic([1, 1, 2, 5, 7], values, 'sum', bins=2) # (array([ 4. , 4.5]), array([ 1., 4., 7.]), array([1, 1, 1, 2, 2])) # Multiple arrays of values can also be passed. The statistic is calculated # on each set independently: values = [[1.0, 1.0, 2.0, 1.5, 3.0], [2.0, 2.0, 4.0, 3.0, 6.0]] stats.binned_statistic([1, 1, 2, 5, 7], values, 'sum', bins=2) # (array([[ 4. , 4.5], [ 8. , 9. ]]), array([ 1., 4., 7.]), # array([1, 1, 1, 2, 2])) stats.binned_statistic([1, 2, 1, 2, 4], np.arange(5), statistic='mean', bins=3) # (array([ 1., 2., 4.]), array([ 1., 2., 3., 4.]), # array([1, 2, 1, 2, 3])) # As a second example, we now generate some random data of sailing boat speed # as a function of wind speed, and then determine how fast our boat is for # certain wind speeds: windspeed = 8 * np.random.rand(500) boatspeed = .3 * windspeed**.5 + .2 * np.random.rand(500) bin_means, bin_edges, binnumber = stats.binned_statistic(windspeed, boatspeed, statistic='median', bins=[1,2,3,4,5,6,7]) plt.figure() plt.plot(windspeed, boatspeed, 'b.', label='raw data') plt.hlines(bin_means, bin_edges[:-1], bin_edges[1:], colors='g', lw=5, label='binned statistic of data') plt.legend() # Now we can use ``binnumber`` to select all datapoints with a windspeed # below 1: low_boatspeed = boatspeed[binnumber == 0] # As a final example, we will use ``bin_edges`` and ``binnumber`` to make a # plot of a distribution that shows the mean and distribution around that # mean per bin, on top of a regular histogram and the probability # distribution function: x = np.linspace(0, 5, num=500) x_pdf = stats.maxwell.pdf(x) samples = stats.maxwell.rvs(size=10000) bin_means, bin_edges, binnumber = stats.binned_statistic(x, x_pdf, statistic='mean', bins=25) bin_width = (bin_edges[1] - bin_edges[0]) bin_centers = bin_edges[1:] - bin_width/2 plt.figure() plt.hist(samples, bins=50, normed=True, histtype='stepfilled', alpha=0.2, label='histogram of data') plt.plot(x, x_pdf, 'r-', label='analytical pdf') plt.hlines(bin_means, bin_edges[:-1], bin_edges[1:], colors='g', lw=2, label='binned statistic of data') plt.plot((binnumber - 0.5) * bin_width, x_pdf, 'g.', alpha=0.5) plt.legend(fontsize=10) plt.show()