Attachment 'sheet09.py'

Download

   1 from glob import glob
   2 import re
   3 
   4 
   5 class Histogram(object):
   6     """A simple class which is basically a dictionary
   7     of counts. inc increases counts for keys by one. The
   8     __str__ method is maybe a bit more complex than it needs
   9     to be because it plots the entries sorted in descending
  10     order by count."""
  11     def __init__(self):
  12         self.counts = dict()
  13 
  14     def inc(self, key):
  15         self.counts[key] = self.counts.get(key, 0) + 1
  16 
  17     def keys(self):
  18         return self.counts.keys()
  19 
  20     def __str__(self):
  21         out = []
  22         def cmp_freq(x, y):
  23             if self.counts[x] < self.counts[y]:
  24                 return 1
  25             elif self.counts[x] > self.counts[y]:
  26                 return -1
  27             else:
  28                 return cmp(x, y)
  29         for word in sorted(self.counts.keys(), cmp_freq):
  30             out.append("%s: %d" % (word, self.counts[word]))
  31         return "\n".join(out)
  32 
  33     def __getitem__(self, w):
  34         return self.counts.get(w, 0)
  35 
  36 ######################################################################
  37 #
  38 # Your solution in this class!
  39 #
  40 
  41 # A DocumentAnalyzer
  42 #
  43 # This class collects word counts for a number of documents and also
  44 # counts how often a word occurs in a document.
  45 #
  46 # count_words is called with file names and a class label for each document.
  47 # count_words should call clean_word to remove punctuation from found
  48 # words. Finally, write_as_matlab should write the results out to a file
  49 # defining:
  50 #
  51 #   - words: a cell array of the words
  52 #   - word_counts: an array where the word counts are stored. Rows
  53 #       are documents, columns are ordered as in the "words" cell
  54 #       array
  55 #   - doc_counts: an array which lists in how many documents a word
  56 #       occurs.
  57 #   - doc_class: an array where the document classes are stored.
  58 #
  59 class DocumentAnalyzer(object):
  60     def clean_word(self, w):
  61         # ...
  62         
  63     def count_words(self, fn, cl):
  64         # ...
  65         
  66     def write_as_matlab(self, fn):
  67         # ...
  68 
  69 ######################################################################
  70 # Main function below
  71 #
  72 
  73 counts = DocumentAnalyzer()
  74 for fn in sorted(glob('data/money*')):
  75     print "analyzing " + fn
  76     counts.count_words(fn, 1)
  77 
  78 for fn in sorted(glob('data/not-money*')):
  79     print "analyzing " + fn
  80     counts.count_words(fn, -1)
  81 
  82 print "writing results"
  83 counts.write_as_matlab('data.m')

Attached Files

To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.
  • [get | view] (2010-06-17 12:21:00, 2873.9 KB) [[attachment:bioinf.pdf]]
  • [get | view] (2010-05-04 11:11:44, 3591.3 KB) [[attachment:cca_lecture.pdf]]
  • [get | view] (2010-06-08 09:37:55, 209.6 KB) [[attachment:data.tar.gz]]
  • [get | view] (2010-04-19 09:59:41, 65.6 KB) [[attachment:full_sheet01.pdf]]
  • [get | view] (2010-04-20 09:18:53, 61.3 KB) [[attachment:full_sheet02.pdf]]
  • [get | view] (2010-04-27 09:42:10, 70.0 KB) [[attachment:full_sheet03.pdf]]
  • [get | view] (2010-05-04 10:48:12, 75.0 KB) [[attachment:full_sheet04.pdf]]
  • [get | view] (2010-05-11 08:22:55, 91.1 KB) [[attachment:full_sheet05.pdf]]
  • [get | view] (2010-05-18 10:01:06, 61.9 KB) [[attachment:full_sheet06.pdf]]
  • [get | view] (2010-05-27 10:02:14, 76.7 KB) [[attachment:full_sheet07.pdf]]
  • [get | view] (2010-06-01 08:38:57, 70.8 KB) [[attachment:full_sheet08.pdf]]
  • [get | view] (2010-06-08 09:37:48, 58.2 KB) [[attachment:full_sheet09.pdf]]
  • [get | view] (2010-06-15 10:05:24, 120.8 KB) [[attachment:full_sheet10.pdf]]
  • [get | view] (2010-06-22 08:07:29, 71.3 KB) [[attachment:full_sheet11.pdf]]
  • [get | view] (2010-06-29 09:14:44, 76.2 KB) [[attachment:full_sheet12.pdf]]
  • [get | view] (2010-07-06 10:08:39, 83.4 KB) [[attachment:full_sheet13.pdf]]
  • [get | view] (2010-06-01 08:40:12, 1391.7 KB) [[attachment:kld-tutorial.pdf]]
  • [get | view] (2010-05-27 06:38:11, 2850.3 KB) [[attachment:lect-ids.pdf]]
  • [get | view] (2010-05-20 13:07:56, 2099.2 KB) [[attachment:lect-struct.pdf]]
  • [get | view] (2010-04-20 09:19:25, 26591.3 KB) [[attachment:mnist_train.mat]]
  • [get | view] (2010-07-06 10:08:16, 192.5 KB) [[attachment:optim-intro.pdf]]
  • [get | view] (2010-04-20 09:19:00, 1.0 KB) [[attachment:sheet02.m]]
  • [get | view] (2010-05-11 08:23:01, 0.6 KB) [[attachment:sheet05.m]]
  • [get | view] (2010-05-27 10:02:41, 4.3 KB) [[attachment:sheet07.m]]
  • [get | view] (2010-06-01 08:39:07, 0.9 KB) [[attachment:sheet08.m]]
  • [get | view] (2010-06-08 09:38:00, 2.2 KB) [[attachment:sheet09.m]]
  • [get | view] (2010-06-08 09:38:06, 2.3 KB) [[attachment:sheet09.py]]
  • [get | view] (2010-06-22 08:07:55, 1.1 KB) [[attachment:sheet11.m]]
  • [get | view] (2010-06-22 08:07:51, 129.6 KB) [[attachment:splice-test-data.txt]]
  • [get | view] (2010-06-22 08:09:26, 5.4 KB) [[attachment:splice-test-label.txt]]
  • [get | view] (2010-06-22 08:07:41, 59.6 KB) [[attachment:splice-train-data.txt]]
  • [get | view] (2010-06-22 08:07:47, 2.5 KB) [[attachment:splice-train-label.txt]]
  • [get | view] (2010-04-27 08:49:35, 1515.8 KB) [[attachment:ssa_data.mat]]
  • [get | view] (2010-04-27 08:49:39, 585.7 KB) [[attachment:ssa_lecture.pdf]]
  • [get | view] (2010-04-27 08:49:50, 7.4 KB) [[attachment:ssa_simple.m]]
  • [get | view] (2010-05-27 06:34:10, 1217.5 KB) [[attachment:stud-data.mat.gz]]
  • [get | view] (2010-06-08 09:39:24, 1013.6 KB) [[attachment:textmining.pdf]]
  • [get | view] (2010-05-04 10:49:39, 1.0 KB) [[attachment:tkcca_example.m]]
  • [get | view] (2010-05-04 10:48:19, 4.1 KB) [[attachment:tkcca_simple.m]]
  • [get | view] (2010-05-04 10:48:24, 150.9 KB) [[attachment:tkcca_toy_data.mat]]
 All files | Selected Files: delete move to page copy to page

You are not allowed to attach a file to this page.