Skip to content

Commit 82ab4ae

Browse files
author
Johannes Simon
committed
- enhance SurfaceFormDictionaryWordCount to also count word senses and
list all word senses
1 parent 153451a commit 82ab4ae

File tree

1 file changed

+24
-7
lines changed

1 file changed

+24
-7
lines changed

src/main/java/de/tudarmstadt/lt/wiki/hadoop/SurfaceFormDictionaryWordCount.java

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,30 +2,47 @@
22
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
1919

2020
public class SurfaceFormDictionaryWordCount extends Configured implements Tool {
21-
private static class HadoopSurfaceFormDictionaryMap extends Mapper<LongWritable, Text, Text, IntWritable> {
21+
private static class HadoopSurfaceFormDictionaryMap extends Mapper<LongWritable, Text, Text, Text> {
2222
@Override
2323
public void map(LongWritable key, Text value, Context context)
2424
throws IOException, InterruptedException {
2525
String word = value.toString().split("@@")[0];
26+
String target = value.toString().split("@@")[1];
2627
String count = value.toString().split("\t")[1];
2728

28-
context.write(new Text(word), new IntWritable(Integer.parseInt(count)));
29+
context.write(new Text(word), new Text(target + ":" + count));
30+
}
31+
}
32+
33+
private static class WikiSenseDictionaryReduce extends Reducer<Text, Text, Text, Text> {
34+
@Override
35+
public void reduce(Text word, Iterable<Text> targetCounts, Context context)
36+
throws IOException, InterruptedException {
37+
int numSenses = 0;
38+
int totalCount = 0;
39+
for (Text targetCount : targetCounts) {
40+
int count = Integer.parseInt(targetCount.toString().split(":")[1]);
41+
totalCount += count;
42+
numSenses++;
43+
}
44+
45+
context.write(word, new Text(totalCount + "\t" + numSenses + "\t" + StringUtils.join(targetCounts.iterator(), " ")));
2946
}
3047
}
3148

@@ -45,11 +62,11 @@ public boolean runJob(String inDir, String outDir) throws Exception {
4562
FileInputFormat.addInputPath(job, new Path(inDir));
4663
FileOutputFormat.setOutputPath(job, new Path(_outDir));
4764
job.setMapperClass(HadoopSurfaceFormDictionaryMap.class);
48-
job.setReducerClass(IntSumReducer.class);
65+
job.setReducerClass(WikiSenseDictionaryReduce.class);
4966
job.setMapOutputKeyClass(Text.class);
50-
job.setMapOutputValueClass(IntWritable.class);
67+
job.setMapOutputValueClass(Text.class);
5168
job.setOutputKeyClass(Text.class);
52-
job.setOutputValueClass(IntWritable.class);
69+
job.setOutputValueClass(Text.class);
5370
job.setJobName("WikiLinkProcessor:SurfaceFormDictionaryWordCount");
5471
return job.waitForCompletion(true);
5572
}

0 commit comments

Comments
 (0)