Examining common words in the summaries of the benetech dataset.

Ian Wong · Ian Wong · commit cc04c4904e24 · 2011-11-05T22:09:23.000-07:00
diff --git a/SF_2011/Benetech/R/common_summaries.R b/SF_2011/Benetech/R/common_summaries.R
@@ -0,0 +1,15 @@
+# Keep only the rows of `benetech` that have a non-empty English summary.
+with_summaries <- subset(benetech, nchar(summary) > 0)
+with_english_summaries <- subset(with_summaries, language == "en")
+
+# Write one summary per line. write.table() is used instead of write.csv()
+# because write.csv() ignores col.names = FALSE (with a warning) and would
+# emit a header row that then gets counted as a word downstream.
+write.table(with_english_summaries$summary, "with-summaries.csv",
+            col.names = FALSE, row.names = FALSE, quote = FALSE)
+
+# then use
+# cat with-summaries.csv | tr " " "\n" | sort | uniq -c
+# (uniq -c prefixes each distinct word with its count, giving the
+# "<count> <word>" lines that common_words.py parses)
+# then invoke common_words.py
diff --git a/SF_2011/Benetech/python/common_words.py b/SF_2011/Benetech/python/common_words.py
@@ -0,0 +1,36 @@
+import numpy
+
+# Rank the words of a `uniq -c` listing by how often they occur.
+#
+# `body` must already be defined before this runs: an iterable of lines of the
+# form "<count> <word>", optionally padded with leading spaces.
+word_to_count = {}
+for line in body:
+  fields = line.strip().split(None, 1)
+  if not fields:
+    # Blank line: nothing to count.
+    continue
+  # Keep the count as an int so the ranking below is numeric; compared as
+  # strings, "9" would sort above "10". int() raises ValueError when a line
+  # does not start with a count (e.g. input made with plain `uniq`).
+  count = int(fields[0])
+  # A line holding only a count has no word; file it under the empty string.
+  word = fields[1] if len(fields) == 2 else ""
+  word_to_count[word] = count
+
+# The (up to) 50 most frequent words, least frequent first.
+words = list(word_to_count)
+counts = [word_to_count[word] for word in words]
+top_words = numpy.array(words)[numpy.argsort(counts)][-50:]
+print(top_words)
+# NOTE(review): the sample below presumably predates the numeric sort (counts
+# used to be compared as strings); re-run to refresh it.
+# array(['formed', 'Thanpyuzayart', 'ya', 'fruit', '20', 'following',
+#        'Division', 'Time)', 'what', 'Operation', '2007', 'endure', 'been',
+#        'leader', 'most', 'demanded', 'participation', 'physical', 'arrest',
+#        'should', 'tried', 'Kaw', 'when', 'as', 'Nam', '2011', 'taking',
+#        'place', 'sent', 'Zaw', 'Tun', 'over', 'Namkham', 'Tin', 'health',
+#        'outside', 'I', 'While', 'Moe', 'Lay', 'prisons', 'Three',
+#        'further', 'reported', '(First', 'according', "don't", 'If', 'rice',
+#        'and'],
+#       dtype='|S78')