from glob import glob
import re


class Histogram(object):
    """A dictionary of counts keyed by arbitrary hashable values.

    ``inc`` increases the count of a key by one and indexing returns the
    count of a key (0 for keys that were never counted). ``str()``
    renders one ``key: count`` line per entry, sorted by descending
    count; entries with equal counts are ordered by ascending key.
    """

    def __init__(self):
        self.counts = dict()

    def inc(self, key):
        """Increase the count stored for *key* by one."""
        self.counts[key] = self.counts.get(key, 0) + 1

    def keys(self):
        """Return the keys that have been counted so far."""
        return self.counts.keys()

    def __str__(self):
        """Return the entries as ``key: count`` lines, most frequent first."""
        # A key function replaces the former cmp-style comparator:
        # sorted() accepts a positional comparison function only on
        # Python 2, and the cmp() builtin is gone on Python 3. Negating
        # the count sorts by descending frequency while the key itself
        # breaks ties in ascending order, exactly as before.
        ordered = sorted(self.counts, key=lambda k: (-self.counts[k], k))
        return "\n".join("%s: %d" % (k, self.counts[k]) for k in ordered)

    def __getitem__(self, w):
        """Return the count for *w*, or 0 if *w* was never counted."""
        return self.counts.get(w, 0)

######################################################################
#
# Your solution in this class!
#

# A DocumentAnalyzer
#
# This class collects word counts for a number of documents and also
# counts how often a word occurs in a document.
#
# count_words is called with file names and a class label for each document.
# count_words should call clean_word to remove punctuation from found
# words. Finally, write_as_matlab should write the results out to a file
# defining:
#
#   - words: a cell array of the words
#   - word_counts: an array where the word counts are stored. Rows
#       are documents, columns are ordered as in the "words" cell
#       array
#   - doc_counts: an array which lists in how many documents a word
#       occurs.
#   - doc_class: an array where the document classes are stored.
#
class DocumentAnalyzer(object):
    """Collect word counts per document and document counts per word.

    Call ``count_words`` once per document, then ``write_as_matlab`` to
    dump everything collected so far as MATLAB assignments.
    """

    # ASCII punctuation characters that clean_word strips from a token.
    _PUNCTUATION = re.compile(
        "[%s]" % re.escape("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"))

    def __init__(self):
        # One Histogram of word counts per analyzed document, in the
        # order count_words was called.
        self._doc_histograms = []
        # Class label of each document, parallel to _doc_histograms.
        self._doc_classes = []
        # For every word, the number of documents it occurs in.
        self._doc_counts = Histogram()

    def clean_word(self, w):
        """Return *w* with all ASCII punctuation characters removed.

        The result is the empty string when *w* consists of punctuation
        only.
        """
        return self._PUNCTUATION.sub("", w)

    def count_words(self, fn, cl):
        """Count the words of the document stored in file *fn*.

        The file is split on whitespace, every token is passed through
        ``clean_word`` and tokens that end up empty are ignored. *cl* is
        recorded as the class label of the document.
        """
        doc = Histogram()
        with open(fn) as handle:
            for line in handle:
                for token in line.split():
                    word = self.clean_word(token)
                    if word:
                        doc.inc(word)
        # Each distinct word contributes once per document, regardless
        # of how often it occurs inside that document.
        for word in doc.keys():
            self._doc_counts.inc(word)
        self._doc_histograms.append(doc)
        self._doc_classes.append(cl)

    def write_as_matlab(self, fn):
        """Write the collected statistics to *fn* as MATLAB assignments.

        The file defines ``words`` (cell array of all words, sorted),
        ``word_counts`` (one row per document, columns ordered like
        ``words``), ``doc_counts`` (number of documents each word occurs
        in) and ``doc_class`` (class label of each document).
        """
        words = sorted(self._doc_counts.keys())
        if words:
            rows = [" ".join(str(doc[w]) for w in words)
                    for doc in self._doc_histograms]
            word_counts = "[\n%s\n]" % "\n".join(rows)
        else:
            # No words at all: rows without columns cannot be written
            # as a bracket literal, so state the shape explicitly.
            word_counts = "zeros(%d, 0)" % len(self._doc_histograms)
        lines = [
            # A single quote inside a MATLAB string is escaped by
            # doubling it; relevant if clean_word is overridden.
            "words = {%s};" % ", ".join(
                "'%s'" % w.replace("'", "''") for w in words),
            "word_counts = %s;" % word_counts,
            "doc_counts = [%s];" % " ".join(
                str(self._doc_counts[w]) for w in words),
            "doc_class = [%s];" % " ".join(
                str(c) for c in self._doc_classes),
        ]
        with open(fn, "w") as out:
            out.write("\n".join(lines) + "\n")

######################################################################
# Main function below
#

# Feed every matching file to the analyzer: files matching 'data/money*'
# get class label 1, files matching 'data/not-money*' get class label -1.
# The results are then written to 'data.m'.
counts = DocumentAnalyzer()
for pattern, label in (('data/money*', 1), ('data/not-money*', -1)):
    # sorted() makes the document order independent of the order in
    # which glob happens to return the file names.
    for fn in sorted(glob(pattern)):
        # print with a single parenthesised argument produces the same
        # output on Python 2 and Python 3.
        print("analyzing " + fn)
        counts.count_words(fn, label)

print("writing results")
counts.write_as_matlab('data.m')