#! /usr/bin/env python3.3
"""Calculating and dumping a histogram.
Used for internal instrumentation.
"""
import logging
import math
# pylint:disable=W9903
# non-gettext-ed string
# This is a debug dump, no L10N required.
LOG = logging.getLogger(__name__)
def _round(val):
"""Return some round-ish integer number like 5, 10, 25, etc."""
if val < 1:
return 1
e = pow(10, int(math.log10(val)))
m = val / e
# pylint: disable=multiple-statements
# More than one statement on a single line; keep tabular code tabular.
if m <= 1.0: m = 1.0
elif m <= 2.5: m = 2.5
elif m <= 5.0: m = 5.0
else: m = 10.0
return int(m * e)
def bucket_ends_linear(min_val, max_val, bucket_ct):
"""Return a list of maximum values, one for each bucket.
Often returns fewer than bucket_ct buckets due to choosing nice round
bucket sizes.
"""
val_range = max_val - min_val
if val_range < 1:
val_range = 1
step_width = _round(float(val_range) / float(bucket_ct))
if step_width < 1:
step_width = 1
step_ct = int(val_range / step_width)
if step_ct < 1:
step_ct = 1
if bucket_ct % step_width:
step_ct += 1
return [min_val + step_width * i for i in range(1, 1 + step_ct)]
def bucket_ends_logarithmic(min_val, max_val):
"""Return a list of maximum values, one for each bucket."""
min_pow10 = int(math.log10(min_val)) if 1 < min_val else 1
max_pow10 = 1 + int(math.log10(max_val)) if 1 < max_val else 1
bucket_ends = []
for pow10 in range(min_pow10, 1+max_pow10):
x10 = int(math.pow(10, pow10))
for k in [1, 2, 5]:
end = k * x10
bucket_ends.append(end)
if max_val < end:
# Early return if we only need "200" and not "500"
return bucket_ends
return bucket_ends
def to_histogram(coll, bucket_ct=10, bucket_prefix=[1, 2, 5], log=False):
"""How many elements of coll have a value of X?
bucket_prefix is a set of (usually very small-count) buckets to place at
the front of the bucket list. This helps dig deeper into long-tail
counts like the 500 different entries with a count of 1 or 2.
"""
# pylint: disable=dangerous-default-value
# Yes, it is indeed dangerous since the default value is NOT const and
# to_histogram() could (evil!) modify the contents of the default. But it
# doesn't. It treats it as const and only reads.
min_value = min(coll)
max_value = max(coll)
if log:
bucket_ends = bucket_ends_logarithmic(min_value, max_value)
else:
bucket_ends = bucket_ends_linear(min_value, max_value, bucket_ct)
# Force long-tail buckets:
for be in reversed(bucket_prefix):
if be < bucket_ends[0]:
bucket_ends = [be] + bucket_ends
hist = {be: 0 for be in bucket_ends}
for val in coll:
for be in bucket_ends:
if val <= be:
hist[be] += 1
break
return hist
def bar_of_stars(nom, denom, max_stars=60):
"""Return a string of *, suitable for a bar for a bar graph."""
if not (denom and max_stars):
return ''
ct = float(nom) / float(denom) * float(max_stars)
return '*' * int(ct + 0.5)
def digit_count(n):
"""How many digits in n, including space for commas?"""
if n < 1:
return 1
return len("{:,d}".format(n))
def to_lines(histo):
"""Return a list of lines that bring out a histogram."""
max_val = max(histo.values())
bucket_ends = sorted(histo.keys())
be_fmt_width = str(digit_count(bucket_ends[-1]))
val_fmt_width = str(digit_count(max_val))
fmt = ( "{bb:" + be_fmt_width + ",d}-"
+ "{be:" + be_fmt_width + ",d} : "
"{val:" + val_fmt_width + ",d} {bar}")
bb = 0
lines = []
for be in bucket_ends:
val = histo[be]
_bar = bar_of_stars(val, max_val)
s = fmt.format(bb=bb, be=be, val=histo[be], bar=_bar)
lines.append(s)
bb = be + 1
return lines