Attachment 'sheet09.m'
Download 1 function out = sheet09
2 % SHEET09 Driver for sheet 09: compares raw word counts, tf and tf-idf features; OUT (fields tf, tfidf) is set only when requested.
3 data % load data prepared by sheet09.py
4
5 % compute word counts, tf-idf, and term frequencies
6 word_counts = word_counts; % self-assignment kept from the original; presumably word_counts is created by the data step above -- TODO confirm
7 tf_feats = tf(word_counts); % not named 'tf': a variable called tf would shadow the function tf() that is called again below
8 tfidf = tf_idf(word_counts, doc_counts);
9
10 % plot similarities (scalar products) for all three
11 % features
12 figure(1)
13 S = similarities(word_counts);
14 plot_sim(S, doc_class);
15 title('similarities using the raw word_counts');
16
17 figure(2)
18 S = similarities(tf_feats);
19 plot_sim(S, doc_class);
20 title('similarities using the term-frequencies');
21
22 figure(3)
23 S = similarities(tfidf);
24 plot_sim(S, doc_class);
25 title('similarities using the tf-idfs');
26 caxis([0, 1])
27
28 % collect word counts to compare classes
29 wc1 = collect_word_counts(word_counts, doc_class == 1);
30 wc2 = collect_word_counts(word_counts, doc_class == -1);
31
32 % first, we take term-frequencies
33 wc1 = tf(wc1);
34 wc2 = tf(wc2);
35
36 fprintf('Top 20 words for positive and negative class using\n');
37 fprintf('term frequencies\n');
38
39 fprintf('Top 20 positive words:\n');
40 show_top_words(20, wc1, words)
41 fprintf('Top 20 negative words:\n');
42 show_top_words(20, wc2, words)
43
44 % now, we weight the term frequencies by the inverse document
45 % frequency
46 tfidf1 = tf_idf(wc1, doc_counts);
47 tfidf2 = tf_idf(wc2, doc_counts);
48
49 fprintf('Top 20 words for positive and negative class using\n');
50 fprintf('tf-idf scores\n');
51
52 fprintf('Top 20 positive words:\n');
53 show_top_words(20, tfidf1, words)
54 fprintf('Top 20 negative words:\n');
55 show_top_words(20, tfidf2, words)
56 if nargout > 0, out.tf = tf_feats; out.tfidf = tfidf; end % assign OUT only on request so a plain call displays no ans
57 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
58 % Your solution below!
59 %
60
61 % 1. compute term frequencies from word counts (input and output share the name word_counts)
62 function word_counts = tf(word_counts)
63 % ... (placeholder: body not implemented in this listing)
64
65 % 2. compute the TF-IDF statistic from word_counts and doc_counts. NOTE(review): callers pass both raw counts and tf(...) output as the first argument
66 function score = tf_idf(word_counts, doc_counts)
67 % ... (placeholder: body not implemented in this listing)
68
69 % 3. compute linear similarities (scalar products) between
70 % all *rows* of matrix feats.
71 function S = similarities(feats)
72 % ... (placeholder: body not implemented in this listing)
73
74 % 4. plot similarities. Also plot boxes around the classes. You can
75 % assume that the class where doc_class == 1 comes first.
76 function plot_sim(S, doc_class)
77 % ... (placeholder: body not implemented in this listing)
78
79 % 5. print the top n entries in each *row* of feats. Use the words
80 % cell array to print the actual words instead of their indices.
81 function show_top_words(n, feats, words)
82 % ... (placeholder: body not implemented in this listing)
83
84 % 6. from word_counts, sum the rows selected by index I (the caller passes a logical mask such as doc_class == 1)
85 function wc = collect_word_counts(word_counts, I)
86 % ... (placeholder: body not implemented in this listing)
Attached Files
To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily. You are not allowed to attach a file to this page.