Attachment 'sheet08.py'

Download

   1 from glob import glob
   2 import re
   3 
   4 
   5 class Histogram(object):
   6     """A simple class which is basically a dictionary
   7     of counts. inc increases counts for keys by one. The
   8     __str__ method is maybe a bit more complex than it needs
   9     to be because it plots the entries sorted in descending
  10     order by count."""
  11     def __init__(self):
  12         self.counts = dict()
  13 
  14     def inc(self, key):
  15         self.counts[key] = self.counts.get(key, 0) + 1
  16 
  17     def keys(self):
  18         return self.counts.keys()
  19 
  20     def __str__(self):
  21         out = []
  22         def cmp_freq(x, y):
  23             if self.counts[x] < self.counts[y]:
  24                 return 1
  25             elif self.counts[x] > self.counts[y]:
  26                 return -1
  27             else:
  28                 return cmp(x, y)
  29         for word in sorted(self.counts.keys(), cmp_freq):
  30             out.append("%s: %d" % (word, self.counts[word]))
  31         return "\n".join(out)
  32 
  33     def __getitem__(self, w):
  34         return self.counts.get(w, 0)
  35 
  36 ######################################################################
  37 #
  38 # Your solution in this class!
  39 #
  40 
  41 # A DocumentAnalyzer
  42 #
  43 # This class collects word counts for a number of documents and also
  44 # counts how often a word occurs in a document.
  45 #
  46 # count_words is called with file names and a class label for each document.
  47 # count_words should call clean_word to remove punctuation from found
  48 # words. Finally, write_as_matlab should write the results out to a file
  49 # defining:
  50 #
  51 #   - words: a cell array of the words
  52 #   - word_counts: an array where the word counts are stored. Rows
  53 #       are documents, columns are ordered as in the "words" cell
  54 #       array
  55 #   - doc_counts: an array which lists in how many documents a word
  56 #       occurs.
  57 #   - doc_class: an array where the document classes are stored.
  58 #
  59 class DocumentAnalyzer(object):
  60     def clean_word(self, w):
  61         # ...
  62         
  63     def count_words(self, fn, cl):
  64         # ...
  65         
  66     def write_as_matlab(self, fn):
  67         # ...
  68 
  69 ######################################################################
  70 # Main function below
  71 #
  72 
  73 counts = DocumentAnalyzer()
  74 for fn in sorted(glob('data/money*')):
  75     print "analyzing " + fn
  76     counts.count_words(fn, 1)
  77 
  78 for fn in sorted(glob('data/not-money*')):
  79     print "analyzing " + fn
  80     counts.count_words(fn, -1)
  81 
  82 print "writing results"
  83 counts.write_as_matlab('data.m')

Attached Files

To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.
  • [get | view] (2009-06-16 13:02:17, 2873.9 KB) [[attachment:bioinf.pdf]]
  • [get | view] (2009-06-09 09:46:52, 209.6 KB) [[attachment:data.tar.gz]]
  • [get | view] (2009-05-26 09:12:28, 3099.4 KB) [[attachment:intrusion.pdf]]
  • [get | view] (2009-06-02 09:30:50, 1391.7 KB) [[attachment:kld-tutorial.pdf]]
  • [get | view] (2009-07-07 09:31:30, 1506.0 KB) [[attachment:largescale.pdf]]
  • [get | view] (2009-07-07 09:34:04, 192.5 KB) [[attachment:optim-intro.pdf]]
  • [get | view] (2009-05-05 08:16:16, 261.1 KB) [[attachment:recap2.pdf]]
  • [get | view] (2009-04-21 08:42:30, 66.5 KB) [[attachment:sheet01.pdf]]
  • [get | view] (2009-04-28 14:08:19, 1.0 KB) [[attachment:sheet02.m]]
  • [get | view] (2009-04-28 14:08:14, 62.8 KB) [[attachment:sheet02.pdf]]
  • [get | view] (2009-05-05 08:17:47, 0.6 KB) [[attachment:sheet03.m]]
  • [get | view] (2009-05-05 08:22:37, 90.9 KB) [[attachment:sheet03.pdf]]
  • [get | view] (2009-05-12 15:17:16, 1.9 KB) [[attachment:sheet04.m]]
  • [get | view] (2009-05-19 09:18:26, 110.6 KB) [[attachment:sheet04.pdf]]
  • [get | view] (2009-05-19 09:05:24, 63.6 KB) [[attachment:sheet05.pdf]]
  • [get | view] (2009-05-26 09:39:01, 4.4 KB) [[attachment:sheet06.m]]
  • [get | view] (2009-05-26 09:38:57, 84.0 KB) [[attachment:sheet06.pdf]]
  • [get | view] (2009-06-02 09:30:26, 0.9 KB) [[attachment:sheet07.m]]
  • [get | view] (2009-06-02 09:30:21, 72.2 KB) [[attachment:sheet07.pdf]]
  • [get | view] (2009-06-09 09:46:48, 2.2 KB) [[attachment:sheet08.m]]
  • [get | view] (2009-06-09 11:10:06, 60.3 KB) [[attachment:sheet08.pdf]]
  • [get | view] (2009-06-09 11:10:02, 2.3 KB) [[attachment:sheet08.py]]
  • [get | view] (2009-06-16 13:02:05, 122.2 KB) [[attachment:sheet09.pdf]]
  • [get | view] (2009-06-23 12:50:29, 1.1 KB) [[attachment:sheet10.m]]
  • [get | view] (2009-06-23 12:48:35, 77.4 KB) [[attachment:sheet10.pdf]]
  • [get | view] (2009-06-23 12:49:27, 5.8 KB) [[attachment:sheet10.tex]]
  • [get | view] (2009-07-07 13:55:57, 1.0 KB) [[attachment:sheet11.m]]
  • [get | view] (2009-07-07 13:58:20, 81.3 KB) [[attachment:sheet11.pdf]]
  • [get | view] (2009-06-23 12:48:40, 58.4 KB) [[attachment:splice.zip]]
  • [get | view] (2009-05-25 08:18:18, 591.1 KB) [[attachment:ssl2.pdf]]
  • [get | view] (2009-05-19 09:05:38, 614.6 KB) [[attachment:structured2.pdf]]
  • [get | view] (2009-05-26 09:39:05, 1217.5 KB) [[attachment:stud-data.mat.gz]]
  • [get | view] (2009-06-08 17:46:34, 1013.6 KB) [[attachment:textmining.pdf]]
 All files | Selected Files: delete move to page copy to page

You are not allowed to attach a file to this page.