org.archive.access.nutch
Class Nutchwax
java.lang.Object
org.archive.access.nutch.Nutchwax
public class Nutchwax
- extends java.lang.Object
Script to run all indexing jobs, from import through the merge of the final index.
|
Field Summary |
static org.apache.commons.logging.Log |
LOG
|
|
Constructor Summary |
Nutchwax()
Default constructor. |
|
Method Summary |
protected void |
checkArcsDir(org.apache.hadoop.fs.Path arcsDir)
Check that the ARCs directory exists and appears to contain files that list ARCs
(rather than the ARCs themselves). |
protected boolean |
createLinkdb(Nutchwax.OutputDirectories od)
|
protected void |
doAll(org.apache.hadoop.fs.Path input,
java.lang.String collectionName,
Nutchwax.OutputDirectories od)
Run the passed list of MapReduce indexing jobs. |
static void |
doAllUsage(java.lang.String message,
int exitCode)
|
protected void |
doClass(java.lang.String[] args)
|
static void |
doClassUsage(java.lang.String message,
int exitCode)
|
protected void |
doDedup(Nutchwax.OutputDirectories od)
|
static void |
doDedupUsage(java.lang.String message,
int exitCode)
|
protected void |
doImport(org.apache.hadoop.fs.Path input,
java.lang.String collectionName,
Nutchwax.OutputDirectories od)
|
protected void |
doIndexing(Nutchwax.OutputDirectories od)
|
protected void |
doIndexing(Nutchwax.OutputDirectories od,
org.apache.hadoop.fs.Path[] segments)
|
static void |
doIndexUsage(java.lang.String message,
int exitCode)
|
protected void |
doInvert(Nutchwax.OutputDirectories od)
|
protected void |
doInvert(Nutchwax.OutputDirectories od,
org.apache.hadoop.fs.Path[] segments)
|
static void |
doInvertUsage(java.lang.String message,
int exitCode)
|
protected void |
doJob(java.lang.String jobName,
java.lang.String[] args)
|
protected void |
doMerge(Nutchwax.OutputDirectories od)
|
static void |
doMergeUsage(java.lang.String message,
int exitCode)
|
protected void |
doSearch(java.lang.String[] args)
|
static void |
doSearchUsage(java.lang.String message,
int exitCode)
|
protected void |
doUpdate(Nutchwax.OutputDirectories od)
|
protected void |
doUpdate(Nutchwax.OutputDirectories od,
java.lang.String[] segments)
|
static void |
doUpdateUsage(java.lang.String message,
int exitCode)
|
static org.apache.hadoop.io.Text |
generateWaxKey(java.lang.String keyStr,
java.lang.String collection)
|
static org.apache.hadoop.io.Text |
generateWaxKey(org.apache.hadoop.io.WritableComparable key,
java.lang.String collection)
|
static java.lang.String |
getCollectionFromWaxKey(org.apache.hadoop.io.WritableComparable key)
|
static long |
getDate(java.lang.String d)
|
org.apache.hadoop.fs.FileSystem |
getFS()
|
org.apache.hadoop.mapred.JobConf |
getJobConf()
|
protected org.apache.hadoop.fs.Path[] |
getSegments(Nutchwax.OutputDirectories od)
|
static java.lang.String |
getUrlFromWaxKey(org.apache.hadoop.io.WritableComparable key)
|
static void |
main(java.lang.String[] args)
|
protected java.lang.String[] |
rewriteArgs(java.lang.String[] args,
int offset)
|
static void |
usage(java.lang.String message,
int exitCode)
|
| Methods inherited from class java.lang.Object |
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
LOG
public static final org.apache.commons.logging.Log LOG
Nutchwax
public Nutchwax()
throws java.io.IOException
- Default constructor.
- Throws:
java.io.IOException
getJobConf
public org.apache.hadoop.mapred.JobConf getJobConf()
getFS
public org.apache.hadoop.fs.FileSystem getFS()
throws java.io.IOException
- Throws:
java.io.IOException
doAll
protected void doAll(org.apache.hadoop.fs.Path input,
java.lang.String collectionName,
Nutchwax.OutputDirectories od)
throws java.lang.Exception
- Run the passed list of MapReduce indexing jobs. Jobs are always run in
order: import, update, etc.
- Throws:
java.lang.Exception
doImport
protected void doImport(org.apache.hadoop.fs.Path input,
java.lang.String collectionName,
Nutchwax.OutputDirectories od)
throws java.io.IOException
- Throws:
java.io.IOException
doUpdate
protected void doUpdate(Nutchwax.OutputDirectories od)
throws java.io.IOException
- Throws:
java.io.IOException
doUpdate
protected void doUpdate(Nutchwax.OutputDirectories od,
java.lang.String[] segments)
throws java.io.IOException
- Throws:
java.io.IOException
getSegments
protected org.apache.hadoop.fs.Path[] getSegments(Nutchwax.OutputDirectories od)
throws java.io.IOException
- Throws:
java.io.IOException
doInvert
protected void doInvert(Nutchwax.OutputDirectories od,
org.apache.hadoop.fs.Path[] segments)
throws java.io.IOException
- Throws:
java.io.IOException
doInvert
protected void doInvert(Nutchwax.OutputDirectories od)
throws java.io.IOException
- Throws:
java.io.IOException
createLinkdb
protected boolean createLinkdb(Nutchwax.OutputDirectories od)
throws java.io.IOException
- Throws:
java.io.IOException
doIndexing
protected void doIndexing(Nutchwax.OutputDirectories od)
throws java.io.IOException
- Throws:
java.io.IOException
doIndexing
protected void doIndexing(Nutchwax.OutputDirectories od,
org.apache.hadoop.fs.Path[] segments)
throws java.io.IOException
- Throws:
java.io.IOException
doDedup
protected void doDedup(Nutchwax.OutputDirectories od)
throws java.io.IOException
- Throws:
java.io.IOException
doMerge
protected void doMerge(Nutchwax.OutputDirectories od)
throws java.io.IOException
- Throws:
java.io.IOException
rewriteArgs
protected java.lang.String[] rewriteArgs(java.lang.String[] args,
int offset)
doClass
protected void doClass(java.lang.String[] args)
doSearch
protected void doSearch(java.lang.String[] args)
doJob
protected void doJob(java.lang.String jobName,
java.lang.String[] args)
throws java.lang.Exception
- Throws:
java.lang.Exception
checkArcsDir
protected void checkArcsDir(org.apache.hadoop.fs.Path arcsDir)
throws java.io.IOException
- Check that the ARCs directory exists and appears to contain files that list ARCs
(rather than the ARCs themselves).
- Parameters:
arcsDir - Directory to examine.
- Throws:
java.io.IOException
generateWaxKey
public static org.apache.hadoop.io.Text generateWaxKey(org.apache.hadoop.io.WritableComparable key,
java.lang.String collection)
generateWaxKey
public static org.apache.hadoop.io.Text generateWaxKey(java.lang.String keyStr,
java.lang.String collection)
getCollectionFromWaxKey
public static java.lang.String getCollectionFromWaxKey(org.apache.hadoop.io.WritableComparable key)
throws java.io.IOException
- Throws:
java.io.IOException
getUrlFromWaxKey
public static java.lang.String getUrlFromWaxKey(org.apache.hadoop.io.WritableComparable key)
throws java.io.IOException
- Throws:
java.io.IOException
getDate
public static long getDate(java.lang.String d)
throws java.io.IOException
- Throws:
java.io.IOException
usage
public static void usage(java.lang.String message,
int exitCode)
doUpdateUsage
public static void doUpdateUsage(java.lang.String message,
int exitCode)
doInvertUsage
public static void doInvertUsage(java.lang.String message,
int exitCode)
doIndexUsage
public static void doIndexUsage(java.lang.String message,
int exitCode)
doDedupUsage
public static void doDedupUsage(java.lang.String message,
int exitCode)
doMergeUsage
public static void doMergeUsage(java.lang.String message,
int exitCode)
doSearchUsage
public static void doSearchUsage(java.lang.String message,
int exitCode)
doAllUsage
public static void doAllUsage(java.lang.String message,
int exitCode)
doClassUsage
public static void doClassUsage(java.lang.String message,
int exitCode)
main
public static void main(java.lang.String[] args)
throws java.lang.Exception
- Throws:
java.lang.Exception
Copyright © 2005-2007 Internet Archive. All Rights Reserved.