1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.access.nutch;
24
25 import java.io.IOException;
26
27 import org.apache.hadoop.io.Text;
28 import org.apache.hadoop.io.Writable;
29 import org.apache.hadoop.io.WritableComparable;
30 import org.apache.hadoop.mapred.OutputCollector;
31 import org.apache.hadoop.mapred.Reporter;
32
33 import org.apache.nutch.crawl.CrawlDbFilter;
34
35 /***
36 * Override so we can meddle with the key passed the superclass stripping
37 * collection (then, when the super's mapper is done, put the collection back.
38 * @author stack
39 */
40 public class NutchwaxCrawlDbFilter extends CrawlDbFilter {
41 public void map(final WritableComparable key, Writable value,
42 final OutputCollector output, Reporter r)
43 throws IOException {
44 final String collection = Nutchwax.getCollectionFromWaxKey(key);
45 final OutputCollector oo = new OutputCollector() {
46 public void collect(WritableComparable k, Writable v)
47 throws IOException {
48 output.collect(Nutchwax.generateWaxKey(k, collection), v);
49 }
50 };
51 super.map(new Text(Nutchwax.getUrlFromWaxKey(key)), value, oo, r);
52 }
53 }