Attachment 'sheet08.py'
Download 1 from glob import glob
2 import re
3
4
class Histogram(object):
    """A dictionary of counts keyed by hashable items.

    ``inc`` raises the count of a key by one, indexing returns the count
    of a key (0 for keys that were never counted), and ``str()`` lists
    the entries one per line, sorted by descending count.
    """

    def __init__(self):
        # Maps each key to the number of times inc() was called for it.
        self.counts = dict()

    def inc(self, key):
        """Increase the count stored for *key* by one."""
        self.counts[key] = self.counts.get(key, 0) + 1

    def keys(self):
        """Return the keys that have been counted so far."""
        return self.counts.keys()

    def __str__(self):
        """Return one "key: count" line per entry.

        Entries are ordered by descending count; entries with equal
        counts are ordered by ascending key.
        """
        # A key function replaces the former cmp-style comparator: the
        # cmp argument of sorted() and the cmp() builtin exist only in
        # Python 2, whereas key= works on Python 2 and Python 3 alike.
        # Negating the count gives the descending order by count, and
        # the key itself breaks ties in ascending order.
        ordered = sorted(self.counts, key=lambda k: (-self.counts[k], k))
        return "\n".join("%s: %d" % (k, self.counts[k]) for k in ordered)

    def __getitem__(self, w):
        """Return the count for *w*, or 0 if *w* was never counted."""
        return self.counts.get(w, 0)
35
36 ######################################################################
37 #
38 # Your solution in this class!
39 #
40
41 # A DocumentAnalyzer
42 #
43 # This class collects word counts for a number of documents and also
44 # counts how often a word occurs in a document.
45 #
46 # count_words is called with file names and a class label for each document.
47 # count_words should call clean_word to remove punctuation from found
48 # words. Finally, write_as_matlab should write the results out to a file
49 # defining:
50 #
51 # - words: a cell array of the words
52 # - word_counts: an array where the word counts are stored. Rows
53 # are documents, columns are ordered as in the "words" cell
54 # array
55 # - doc_counts: an array which lists in how many documents a word
56 # occurs.
57 # - doc_class: an array where the document classes are stored.
58 #
59 class DocumentAnalyzer(object):
60 def clean_word(self, w):
61 # ...
62
63 def count_words(self, fn, cl):
64 # ...
65
66 def write_as_matlab(self, fn):
67 # ...
68
69 ######################################################################
70 # Main function below
71 #
72
# Analyze every "money" document with class label 1 and every
# "not-money" document with class label -1, then write the collected
# counts to data.m.
counts = DocumentAnalyzer()
for pattern, label in (('data/money*', 1), ('data/not-money*', -1)):
    # sorted() makes the document order independent of the order in
    # which glob happens to list the files.
    for fn in sorted(glob(pattern)):
        # print with a single parenthesized argument produces the same
        # output on Python 2 and Python 3.
        print("analyzing " + fn)
        counts.count_words(fn, label)

print("writing results")
counts.write_as_matlab('data.m')
Attached Files
To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily. You are not allowed to attach a file to this page.