org.archive.access.nutch
Class Nutchwax

java.lang.Object
  extended by org.archive.access.nutch.Nutchwax

public class Nutchwax
extends java.lang.Object

Script to run all indexing jobs, from import through the merge of the final index.


Nested Class Summary
protected  class Nutchwax.OutputDirectories
           
 
Field Summary
static org.apache.commons.logging.Log LOG
           
 
Constructor Summary
Nutchwax()
          Default constructor.
 
Method Summary
protected  void checkArcsDir(org.apache.hadoop.fs.Path arcsDir)
          Check that the ARCs directory exists and appears to contain files that list ARCs (rather than the ARC files themselves).
protected  boolean createLinkdb(Nutchwax.OutputDirectories od)
           
protected  void doAll(org.apache.hadoop.fs.Path input, java.lang.String collectionName, Nutchwax.OutputDirectories od)
          Run the passed list of MapReduce indexing jobs.
static void doAllUsage(java.lang.String message, int exitCode)
           
protected  void doClass(java.lang.String[] args)
           
static void doClassUsage(java.lang.String message, int exitCode)
           
protected  void doDedup(Nutchwax.OutputDirectories od)
           
static void doDedupUsage(java.lang.String message, int exitCode)
           
protected  void doImport(org.apache.hadoop.fs.Path input, java.lang.String collectionName, Nutchwax.OutputDirectories od)
           
protected  void doIndexing(Nutchwax.OutputDirectories od)
           
protected  void doIndexing(Nutchwax.OutputDirectories od, org.apache.hadoop.fs.Path[] segments)
           
static void doIndexUsage(java.lang.String message, int exitCode)
           
protected  void doInvert(Nutchwax.OutputDirectories od)
           
protected  void doInvert(Nutchwax.OutputDirectories od, org.apache.hadoop.fs.Path[] segments)
           
static void doInvertUsage(java.lang.String message, int exitCode)
           
protected  void doJob(java.lang.String jobName, java.lang.String[] args)
           
protected  void doMerge(Nutchwax.OutputDirectories od)
           
static void doMergeUsage(java.lang.String message, int exitCode)
           
protected  void doSearch(java.lang.String[] args)
           
static void doSearchUsage(java.lang.String message, int exitCode)
           
protected  void doUpdate(Nutchwax.OutputDirectories od)
           
protected  void doUpdate(Nutchwax.OutputDirectories od, java.lang.String[] segments)
           
static void doUpdateUsage(java.lang.String message, int exitCode)
           
static org.apache.hadoop.io.Text generateWaxKey(java.lang.String keyStr, java.lang.String collection)
           
static org.apache.hadoop.io.Text generateWaxKey(org.apache.hadoop.io.WritableComparable key, java.lang.String collection)
           
static java.lang.String getCollectionFromWaxKey(org.apache.hadoop.io.WritableComparable key)
           
static long getDate(java.lang.String d)
           
 org.apache.hadoop.fs.FileSystem getFS()
           
 org.apache.hadoop.mapred.JobConf getJobConf()
           
protected  org.apache.hadoop.fs.Path[] getSegments(Nutchwax.OutputDirectories od)
           
static java.lang.String getUrlFromWaxKey(org.apache.hadoop.io.WritableComparable key)
           
static void main(java.lang.String[] args)
           
protected  java.lang.String[] rewriteArgs(java.lang.String[] args, int offset)
           
static void usage(java.lang.String message, int exitCode)
           
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

LOG

public static final org.apache.commons.logging.Log LOG
Constructor Detail

Nutchwax

public Nutchwax()
         throws java.io.IOException
Default constructor.

Throws:
java.io.IOException
Method Detail

getJobConf

public org.apache.hadoop.mapred.JobConf getJobConf()

getFS

public org.apache.hadoop.fs.FileSystem getFS()
                                      throws java.io.IOException
Throws:
java.io.IOException

doAll

protected void doAll(org.apache.hadoop.fs.Path input,
                     java.lang.String collectionName,
                     Nutchwax.OutputDirectories od)
              throws java.lang.Exception
Run the passed list of MapReduce indexing jobs. Jobs are always run in order: import, update, etc.

Throws:
java.lang.Exception

doImport

protected void doImport(org.apache.hadoop.fs.Path input,
                        java.lang.String collectionName,
                        Nutchwax.OutputDirectories od)
                 throws java.io.IOException
Throws:
java.io.IOException

doUpdate

protected void doUpdate(Nutchwax.OutputDirectories od)
                 throws java.io.IOException
Throws:
java.io.IOException

doUpdate

protected void doUpdate(Nutchwax.OutputDirectories od,
                        java.lang.String[] segments)
                 throws java.io.IOException
Throws:
java.io.IOException

getSegments

protected org.apache.hadoop.fs.Path[] getSegments(Nutchwax.OutputDirectories od)
                                           throws java.io.IOException
Throws:
java.io.IOException

doInvert

protected void doInvert(Nutchwax.OutputDirectories od,
                        org.apache.hadoop.fs.Path[] segments)
                 throws java.io.IOException
Throws:
java.io.IOException

doInvert

protected void doInvert(Nutchwax.OutputDirectories od)
                 throws java.io.IOException
Throws:
java.io.IOException

createLinkdb

protected boolean createLinkdb(Nutchwax.OutputDirectories od)
                        throws java.io.IOException
Throws:
java.io.IOException

doIndexing

protected void doIndexing(Nutchwax.OutputDirectories od)
                   throws java.io.IOException
Throws:
java.io.IOException

doIndexing

protected void doIndexing(Nutchwax.OutputDirectories od,
                          org.apache.hadoop.fs.Path[] segments)
                   throws java.io.IOException
Throws:
java.io.IOException

doDedup

protected void doDedup(Nutchwax.OutputDirectories od)
                throws java.io.IOException
Throws:
java.io.IOException

doMerge

protected void doMerge(Nutchwax.OutputDirectories od)
                throws java.io.IOException
Throws:
java.io.IOException

rewriteArgs

protected java.lang.String[] rewriteArgs(java.lang.String[] args,
                                         int offset)

doClass

protected void doClass(java.lang.String[] args)

doSearch

protected void doSearch(java.lang.String[] args)

doJob

protected void doJob(java.lang.String jobName,
                     java.lang.String[] args)
              throws java.lang.Exception
Throws:
java.lang.Exception

checkArcsDir

protected void checkArcsDir(org.apache.hadoop.fs.Path arcsDir)
                     throws java.io.IOException
Check that the ARCs directory exists and appears to contain files that list ARCs (rather than the ARC files themselves).

Parameters:
arcsDir - Directory to examine.
Throws:
java.io.IOException

generateWaxKey

public static org.apache.hadoop.io.Text generateWaxKey(org.apache.hadoop.io.WritableComparable key,
                                                       java.lang.String collection)

generateWaxKey

public static org.apache.hadoop.io.Text generateWaxKey(java.lang.String keyStr,
                                                       java.lang.String collection)

getCollectionFromWaxKey

public static java.lang.String getCollectionFromWaxKey(org.apache.hadoop.io.WritableComparable key)
                                                throws java.io.IOException
Throws:
java.io.IOException

getUrlFromWaxKey

public static java.lang.String getUrlFromWaxKey(org.apache.hadoop.io.WritableComparable key)
                                         throws java.io.IOException
Throws:
java.io.IOException

getDate

public static long getDate(java.lang.String d)
                    throws java.io.IOException
Throws:
java.io.IOException

usage

public static void usage(java.lang.String message,
                         int exitCode)

doUpdateUsage

public static void doUpdateUsage(java.lang.String message,
                                 int exitCode)

doInvertUsage

public static void doInvertUsage(java.lang.String message,
                                 int exitCode)

doIndexUsage

public static void doIndexUsage(java.lang.String message,
                                int exitCode)

doDedupUsage

public static void doDedupUsage(java.lang.String message,
                                int exitCode)

doMergeUsage

public static void doMergeUsage(java.lang.String message,
                                int exitCode)

doSearchUsage

public static void doSearchUsage(java.lang.String message,
                                 int exitCode)

doAllUsage

public static void doAllUsage(java.lang.String message,
                              int exitCode)

doClassUsage

public static void doClassUsage(java.lang.String message,
                                int exitCode)

main

public static void main(java.lang.String[] args)
                 throws java.lang.Exception
Throws:
java.lang.Exception


Copyright © 2005-2007 Internet Archive. All Rights Reserved.