attachment:sheet08.m of Main/SS09_MaschinellesLernen2

   1 function out = sheet08_solution
   2 % Driver: plots document similarities and prints the top words per class.
   3 data % load data prepared by sheet08.py (expected to define word_counts, doc_counts, doc_class, words)
   4 % NOTE(review): `out` is never assigned below; calling this with an output argument will fail.
   5 % compute term frequencies and tf-idf scores from the word counts
   6 word_counts = word_counts;
   7 tf_feats = tf(word_counts); % not named `tf`: a variable of that name would shadow the function tf
   8 tfidf = tf_idf(word_counts, doc_counts);
   9 
  10 % plot similarities (scalar products) for all three
  11 % features
  12 figure(1)
  13 S = similarities(word_counts);
  14 plot_sim(S, doc_class);
  15 title('similarities using the raw word_counts');
  16 
  17 figure(2)
  18 S = similarities(tf_feats);
  19 plot_sim(S, doc_class);
  20 title('similarities using the term-frequencies');
  21 
  22 figure(3)
  23 S = similarities(tfidf);
  24 plot_sim(S, doc_class);
  25 title('similarities using the tf-idfs');
  26 caxis([0, 1])
  27 
  28 % collect word counts to compare classes
  29 wc1 = collect_word_counts(word_counts, doc_class == 1);
  30 wc2 = collect_word_counts(word_counts, doc_class == -1);
  31 
  32 % first, we take term-frequencies (calls the function tf, see line 7)
  33 wc1 = tf(wc1);
  34 wc2 = tf(wc2);
  35 
  36 fprintf('Top 20 words for positive and negative class using\n');
  37 fprintf('term frequencies\n');
  38 
  39 fprintf('Top 20 positive words:\n');
  40 show_top_words(20, wc1, words)
  41 fprintf('Top 20 negative words:\n');
  42 show_top_words(20, wc2, words)
  43 
  44 % now, we weight the term frequencies by the inverse document
  45 % frequency
  46 tfidf1 = tf_idf(wc1, doc_counts);
  47 tfidf2 = tf_idf(wc2, doc_counts);
  48 
  49 fprintf('Top 20 words for positive and negative class using\n');
  50 fprintf('tf-idf scores\n');
  51 
  52 fprintf('Top 20 positive words:\n');
  53 show_top_words(20, tfidf1, words)
  54 fprintf('Top 20 negative words:\n');
  55 show_top_words(20, tfidf2, words)
  56 
  57 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  58 % Your solution below!
  59 %
  60 
  61 % 1. compute term frequencies from word counts (returned in word_counts)
  62 function word_counts = tf(word_counts)
  63 % ... (not implemented yet)
  64 
  65 % 2. compute the TF-IDF statistic from word_counts and doc_counts
  66 function score = tf_idf(word_counts, doc_counts)
  67 % ... (not implemented yet)
  68 
  69 % 3. compute linear similarities (scalar products) between
  70 % all *rows* of matrix feats and return them in S.
  71 function S = similarities(feats)
  72 % ... (not implemented yet)
  73 
  74 % 4. plot similarities. Also plot boxes around the classes. You can
  75 % assume that the entries with doc_class == 1 come first.
  76 function plot_sim(S, doc_class)
  77 % ... (not implemented yet)
  78 
  79 % 5. print the top n entries in each *row* of feats. Use the words
  80 % cell array to print the actual words instead of their indices.
  81 function show_top_words(n, feats, words)
  82 % ... (not implemented yet)
  83 
  84 % 6. sum the rows of word_counts selected by I (callers pass a logical mask such as doc_class == 1)
  85 function wc = collect_word_counts(word_counts, I)
  86 % ... (not implemented yet)
Attachment 'sheet08.m'

Attached Files