# ports//mail/py-spambayes/work/spambayes-1.0.4/spambayes/Histogram.py

#! /usr/local/bin/python2.3
import math

from spambayes.Options import options

# Probe for the builtin names True and False.  Merely evaluating them raises
# NameError on an interpreter that does not define them.
try:
    True, False
except NameError:
    # Maintain compatibility with Python 2.2: bind integer stand-ins at
    # module level so the rest of this file can use True/False.
    True, False = 1, 0


class Hist:
    """Simple histograms of float values."""

    # Pass None for lo and hi and it will automatically adjust to the min
    # and max values seen.
    # Note:  nbuckets can be passed for backward compatibility.  The
    # display() method can be passed a different nbuckets value.
    def __init__(self, nbuckets=options["TestDriver", "nbuckets"],
                 lo=0.0, hi=100.0):
        self.lo, self.hi = lo, hi
        self.nbuckets = nbuckets
        self.buckets = [0] * nbuckets
        self.data = []  # the raw data points
        self.stats_uptodate = False

    # Add a value to the collection.
    def add(self, x):
        self.data.append(x)
        self.stats_uptodate = False

    # Compute, and set as instance attrs:
    #     n         # of data points
    # The rest are set iff n>0:
    #     min       smallest value in collection
    #     max       largest value in collection
    #     median    midpoint
    #     mean
    #     pct       list of (percentile, score) pairs
    #     var       variance
    #     sdev      population standard deviation (sqrt(variance))
    # self.data is also sorted.
    def compute_stats(self):
        if self.stats_uptodate:
            return
        self.stats_uptodate = True
        data = self.data
        n = self.n = len(data)
        if n == 0:
            return
        data.sort()
        self.min = data[0]
        self.max = data[-1]
        if n & 1:
            self.median = data[n // 2]
        else:
            self.median = (data[n // 2] + data[(n-1) // 2]) / 2.0
        # Compute mean.
        # Add in increasing order of magnitude, to minimize roundoff error.
        if data[0] < 0.0:
            temp = [(abs(x), x) for x in data]
            temp.sort()
            data = [x[1] for x in temp]
            del temp
        sum = 0.0
        for x in data:
            sum += x
        mean = self.mean = sum / n
        # Compute variance.
        var = 0.0
        for x in data:
            d = x - mean
            var += d*d
        self.var = var / n
        self.sdev = math.sqrt(self.var)
        # Compute percentiles.
        self.pct = pct = []
        for p in options["TestDriver", "percentiles"]:
            assert 0.0 <= p <= 100.0
            # In going from data index 0 to index n-1, we move n-1 times.
            # p% of that is (n-1)*p/100.
            i = (n-1)*p/1e2
            if i < 0:
                # Just return the smallest.
                score = data[0]
            else:
                whole = int(i)
                frac = i - whole
                score = data[whole]
                if whole < n-1 and frac:
                    # Move frac of the way from this score to the next.
                    score += frac * (data[whole + 1] - score)
            pct.append((p, score))

    # Merge other into self.
    def __iadd__(self, other):
        self.data.extend(other.data)
        self.stats_uptodate = False
        return self

    def get_lo_hi(self):
        self.compute_stats()
        lo, hi = self.lo, self.hi
        if lo is None:
            lo = self.min
        if hi is None:
            hi = self.max
        return lo, hi

    def get_bucketwidth(self):
        lo, hi = self.get_lo_hi()
        span = float(hi - lo)
        return span / self.nbuckets

    # Set instance var nbuckets to the # of buckets, and buckets to a list
    # of nbuckets counts.
    def fill_buckets(self, nbuckets=None):
        if nbuckets is None:
            nbuckets = self.nbuckets
        if nbuckets <= 0:
            raise ValueError("nbuckets %g > 0 required" % nbuckets)
        self.nbuckets = nbuckets
        self.buckets = buckets = [0] * nbuckets

        # Compute bucket counts.
        lo, hi = self.get_lo_hi()
        bucketwidth = self.get_bucketwidth()
        for x in self.data:
            i = int((x - lo) / bucketwidth)
            if i >= nbuckets:
                i = nbuckets - 1
            elif i < 0:
                i = 0
            buckets[i] += 1

    # Print a histogram to stdout.
    # Also sets instance var nbuckets to the # of buckets, and
    # buckts to a list of nbuckets counts, but only if at least one
    # data point is in the collection.
    def display(self, nbuckets=None, WIDTH=61):
        if nbuckets is None:
            nbuckets = self.nbuckets
        if nbuckets <= 0:
            raise ValueError("nbuckets %g > 0 required" % nbuckets)
        self.compute_stats()
        n = self.n
        if n == 0:
            return
        print "%d items; mean %.2f; sdev %.2f" % (n, self.mean, self.sdev)
        print "-> <stat> min %g; median %g; max %g" % (self.min,
                                                       self.median,
                                                       self.max)
        pcts = ['%g%% %g' % x for x in self.pct]
        print "-> <stat> percentiles:", '; '.join(pcts)

        lo, hi = self.get_lo_hi()
        if lo > hi:
            return

        # hunit is how many items a * represents.  A * is printed for
        # each hunit items, plus any non-zero fraction thereof.
        self.fill_buckets(nbuckets)
        biggest = max(self.buckets)
        hunit, r = divmod(biggest, WIDTH)
        if r:
            hunit += 1
        print "* =", hunit, "items"

        # We need ndigits decimal digits to display the largest bucket count.
        ndigits = len(str(biggest))

        # Displaying the bucket boundaries is more troublesome.
        bucketwidth = self.get_bucketwidth()
        whole_digits = max(len(str(int(lo))),
                           len(str(int(hi - bucketwidth))))
        frac_digits = 0
        while bucketwidth < 1.0:
            # Incrementing by bucketwidth may not change the last displayed
            # digit, so display one more.
            frac_digits += 1
            bucketwidth *= 10.0
        format = ("%" + str(whole_digits + 1 + frac_digits) + '.' +
                  str(frac_digits) + 'f %' + str(ndigits) + "d")

        bucketwidth = self.get_bucketwidth()
        for i in range(nbuckets):
            n = self.buckets[i]
            print format % (lo + i * bucketwidth, n),
            print '*' * ((n + hunit - 1) // hunit)
# syntax highlighted by Code2HTML, v. 0.9.1