1   package org.archive.access.nutch;
2   
3   import java.io.IOException;
4   
5   import javax.servlet.ServletContext;
6   
7   import org.apache.hadoop.conf.Configuration;
8   import org.apache.hadoop.fs.Path;
9   import org.apache.nutch.crawl.Inlinks;
10  import org.apache.nutch.searcher.HitDetails;
11  import org.apache.nutch.searcher.NutchBean;
12  import org.apache.nutch.searcher.Query;
13  import org.apache.nutch.searcher.Summary;
14  
15  /***
16   * Proxy that allows us intercept getSummary so we can change key used.
17   * @author stack
18   */
19  public class NutchwaxBean extends NutchBean {
20  	public NutchwaxBean(Configuration conf, Path dir) throws IOException {
21  		super(conf, dir);
22  	}
23  
24  	public NutchwaxBean(Configuration conf) throws IOException {
25  		super(conf);
26  	}
27  
28  	public static NutchBean get(ServletContext app, Configuration conf)
29  	throws IOException {
30  	    NutchBean bean = (NutchBean)app.getAttribute("nutchBean");
31  	    if (bean == null) {
32  	      if (LOG.isInfoEnabled()) { LOG.info("creating new bean"); }
33  	      // Get the NutchwaxBean in there.
34  	      bean = new NutchwaxBean(conf);
35  	      app.setAttribute("nutchBean", bean);
36  	    }
37  	    return bean;
38  	}
39  	
40  	public Summary[] getSummary(HitDetails[] hits, Query query)
41  	throws IOException {
42  		// Rewrite details so that URL is not just URL when we go to get Summary.
43  		// Its compound of collection and url. Alternative is override of
44  		// NutchBean so we can add in our own Summarizer. NutchBean needs to be
45  		// made more amenable to subclassing. Should be setters for detailers,
46  		// etc. so can supply alternatives (Or pass in a constructor).
47  		HitDetails[] amendedHits = new HitDetails[hits.length];
48  		for (int j = 0; j < hits.length; j++) {
49  			HitDetails h = hits[j];
50  			amendedHits[j] = getCollectionQualifiedHitDetails(h);
51  		}
52  		return super.getSummary(amendedHits, query);
53  	}
54  	
55  	public String[] getAnchors(HitDetails h) throws IOException {
56  		return super.getAnchors(getCollectionQualifiedHitDetails(h));
57  	}
58  	
59  	public Inlinks getInlinks(HitDetails h) throws IOException {
60  		return super.getInlinks(getCollectionQualifiedHitDetails(h));
61  	}
62  	
63  	/***
64  	 * TODO: Make it so I don't have to create a new HitDetails changing
65  	 * the key used doing lookup.
66  	 * @param h
67  	 * @return
68  	 */
69  	protected HitDetails getCollectionQualifiedHitDetails(final HitDetails h) {
70  		return new HitDetails(h.getValue("segment"),
71  	        Nutchwax.generateWaxKey(h.getValue("url"),
72  	            h.getValue("collection")).toString());
73  	}
74  }