#!/usr/bin/env python
"""Word frequencies - count word frequencies in a string."""

def word_freq(text):
    """Return a dictionary of word frequencies for the given text."""

    freqs = {}
    for word in text.split():
        freqs.setdefault(word,0)
        freqs[word] += 1
    return freqs

def print_vk(lst):
    """Print a list of value/key pairs nicely formatted in key/value order."""

    # Find the longest key: remember, the list has value/key paris, so the key
    # is element [1], not [0]
    longest_key = max(map(lambda x: len(x[1]),lst))
    # Make a format string out of it
    fmt = '%'+str(longest_key)+'s -> %s'
    # Do actual printing
    for v,k in lst:
        print fmt % (k,v)

def freq_summ(freqs,n=10):
    """Print a simple summary of a word frequencies dictionary.

    Inputs:
      - freqs: a dictionary of word frequencies.

    Optional inputs:
      - n: the number of """

    words,counts = freqs.keys(),freqs.values()
    # Sort by count
    items = zip(counts,words)
    items.sort()

    print 'Number of words:',len(freqs)
    print
    print '%d least frequent words:' % n
    print_vk(items[:n])
    print
    print '%d most frequent words:' % n
    print_vk(items[-n:])

if __name__ == '__main__':
    import gzip
    text = gzip.open('data/HISTORY.gz').read()
    freqs = word_freq(text)
    freq_summ(freqs,20)

