1   /* $Id: NutchwaxCrawlDbFilter.java 1390 2006-12-19 05:39:48Z stack-sf $
2    * 
3    * Created on December 18, 2006
4    *
5    * Copyright (C) 2006 Internet Archive.
6    * 
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    * 
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   * 
14   * Heritrix is distributed in the hope that it will be useful, 
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   * 
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.access.nutch;
24  
25  import java.io.IOException;
26  
27  import org.apache.hadoop.io.Text;
28  import org.apache.hadoop.io.Writable;
29  import org.apache.hadoop.io.WritableComparable;
30  import org.apache.hadoop.mapred.OutputCollector;
31  import org.apache.hadoop.mapred.Reporter;
32  
33  import org.apache.nutch.crawl.CrawlDbFilter;
34  
35  /***
36   * Override so we can meddle with the key passed the superclass stripping
37   * collection (then, when the super's mapper is done, put the collection back.
38   * @author stack
39   */
40  public class NutchwaxCrawlDbFilter extends CrawlDbFilter {
41      public void map(final WritableComparable key, Writable value,
42              final OutputCollector output, Reporter r)
43      throws IOException {
44          final String collection = Nutchwax.getCollectionFromWaxKey(key);
45          final OutputCollector oo = new OutputCollector() {
46              public void collect(WritableComparable k, Writable v)
47              throws IOException {
48                  output.collect(Nutchwax.generateWaxKey(k, collection), v);
49              }
50          };
51          super.map(new Text(Nutchwax.getUrlFromWaxKey(key)), value, oo, r);
52      }
53   }