Skip to content

Commit 82ab4ae

Browse files
author
Johannes Simon
committed
- enhance SurfaceFormDictionaryWordCount to also count word senses and
list all word senses
1 parent 153451a commit 82ab4ae

File tree

1 file changed

+24
-7
lines changed

1 file changed

+24
-7
lines changed

src/main/java/de/tudarmstadt/lt/wiki/hadoop/SurfaceFormDictionaryWordCount.java

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,30 +2,47 @@
22
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
1919

2020
public class SurfaceFormDictionaryWordCount extends Configured implements Tool {
21-
private static class HadoopSurfaceFormDictionaryMap extends Mapper<LongWritable, Text, Text, IntWritable> {
21+
private static class HadoopSurfaceFormDictionaryMap extends Mapper<LongWritable, Text, Text, Text> {
2222
@Override
2323
public void map(LongWritable key, Text value, Context context)
2424
throws IOException, InterruptedException {
2525
String word = value.toString().split("@@")[0];
26+
String target = value.toString().split("@@")[1];
2627
String count = value.toString().split("\t")[1];
2728

28-
context.write(new Text(word), new IntWritable(Integer.parseInt(count)));
29+
context.write(new Text(word), new Text(target + ":" + count));
30+
}
31+
}
32+
33+
private static class WikiSenseDictionaryReduce extends Reducer<Text, Text, Text, Text> {
34+
@Override
35+
public void reduce(Text word, Iterable<Text> targetCounts, Context context)
36+
throws IOException, InterruptedException {
37+
int numSenses = 0;
38+
int totalCount = 0;
39+
for (Text targetCount : targetCounts) {
40+
int count = Integer.parseInt(targetCount.toString().split(":")[1]);
41+
totalCount += count;
42+
numSenses++;
43+
}
44+
45+
context.write(word, new Text(totalCount + "\t" + numSenses + "\t" + StringUtils.join(targetCounts.iterator(), " ")));
2946
}
3047
}
3148

@@ -45,11 +62,11 @@ public boolean runJob(String inDir, String outDir) throws Exception {
4562
FileInputFormat.addInputPath(job, new Path(inDir));
4663
FileOutputFormat.setOutputPath(job, new Path(_outDir));
4764
job.setMapperClass(HadoopSurfaceFormDictionaryMap.class);
48-
job.setReducerClass(IntSumReducer.class);
65+
job.setReducerClass(WikiSenseDictionaryReduce.class);
4966
job.setMapOutputKeyClass(Text.class);
50-
job.setMapOutputValueClass(IntWritable.class);
67+
job.setMapOutputValueClass(Text.class);
5168
job.setOutputKeyClass(Text.class);
52-
job.setOutputValueClass(IntWritable.class);
69+
job.setOutputValueClass(Text.class);
5370
job.setJobName("WikiLinkProcessor:SurfaceFormDictionaryWordCount");
5471
return job.waitForCompletion(true);
5572
}

0 commit comments

Comments
 (0)