org.archive.access.nutch
Class ImportArcs
java.lang.Object
org.apache.hadoop.util.ToolBase
org.archive.access.nutch.ImportArcs
- All Implemented Interfaces:
- org.apache.hadoop.conf.Configurable, org.apache.hadoop.io.Closeable, org.apache.hadoop.mapred.JobConfigurable, org.apache.hadoop.mapred.Mapper, org.apache.hadoop.util.Tool, org.archive.mapred.ARCRecordMapper
public class ImportArcs
- extends org.apache.hadoop.util.ToolBase
- implements org.archive.mapred.ARCRecordMapper
Ingests ARCs, writing the parse of each ARC record as Nutch FetcherOutputFormat (FOF).
FOF has five outputs:
| Fields inherited from class org.apache.hadoop.util.ToolBase |
conf |
|
Method Summary |
protected boolean |
checkCollectionName()
|
protected static java.lang.String |
checkMimetype(java.lang.String mimetype)
|
void |
close()
|
void |
configure(org.apache.hadoop.mapred.JobConf job)
|
static void |
doImportUsage(java.lang.String message,
int exitCode)
|
protected java.lang.String |
formatToOneLine(java.lang.String s)
|
protected java.lang.String |
getARCName(org.archive.io.arc.ARCRecordMetaData firstARCRecordMeta)
|
protected static java.lang.String |
getCollectionFromArcname(java.lang.String arcurl)
|
org.apache.hadoop.conf.Configuration |
getConf()
|
protected java.lang.String |
getMimetype(java.lang.String mimetype,
org.apache.nutch.util.mime.MimeTypes mts,
java.lang.String url)
|
protected double |
getParseRate(long startTime,
long len)
|
protected java.lang.String |
getParseRateLogMessage(java.lang.String url,
java.lang.String mimetype,
double kbPerSecond)
|
protected java.lang.String |
getStatus(java.lang.String url,
java.lang.String oldUrl,
java.lang.String recordLengthAsStr,
java.lang.String noSpacesMimetype)
|
void |
importArcs(org.apache.hadoop.fs.Path arcUrlsDir,
org.apache.hadoop.fs.Path segment,
java.lang.String collection)
|
protected boolean |
isIndex(org.archive.io.arc.ARCRecord rec)
|
static void |
main(java.lang.String[] args)
|
void |
map(org.apache.hadoop.io.WritableComparable key,
org.apache.hadoop.io.Writable value,
org.apache.hadoop.mapred.OutputCollector output,
org.apache.hadoop.mapred.Reporter r)
|
void |
onARCClose()
|
void |
onARCOpen()
|
int |
run(java.lang.String[] args)
|
void |
setConf(org.apache.hadoop.conf.Configuration c)
|
protected boolean |
skip(java.lang.String mimetype)
|
| Methods inherited from class org.apache.hadoop.util.ToolBase |
doMain |
| Methods inherited from class java.lang.Object |
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
LOG
public final org.apache.commons.logging.Log LOG
ARCFILENAME_KEY
public static final java.lang.String ARCFILENAME_KEY
- See Also:
- Constant Field Values
ARCFILEOFFSET_KEY
public static final java.lang.String ARCFILEOFFSET_KEY
- See Also:
- Constant Field Values
ARCCOLLECTION_KEY
public static final java.lang.String ARCCOLLECTION_KEY
- See Also:
- Constant Field Values
WAX_COLLECTION_KEY
public static final java.lang.String WAX_COLLECTION_KEY
- See Also:
- Constant Field Values
ImportArcs
public ImportArcs()
ImportArcs
public ImportArcs(org.apache.hadoop.conf.Configuration conf)
importArcs
public void importArcs(org.apache.hadoop.fs.Path arcUrlsDir,
org.apache.hadoop.fs.Path segment,
java.lang.String collection)
throws java.io.IOException
- Throws:
java.io.IOException
configure
public void configure(org.apache.hadoop.mapred.JobConf job)
- Specified by:
configure in interface org.apache.hadoop.mapred.JobConfigurable
getConf
public org.apache.hadoop.conf.Configuration getConf()
- Specified by:
getConf in interface org.apache.hadoop.conf.Configurable
- Overrides:
getConf in class org.apache.hadoop.util.ToolBase
setConf
public void setConf(org.apache.hadoop.conf.Configuration c)
- Specified by:
setConf in interface org.apache.hadoop.conf.Configurable
- Overrides:
setConf in class org.apache.hadoop.util.ToolBase
onARCOpen
public void onARCOpen()
- Specified by:
onARCOpen in interface org.archive.mapred.ARCRecordMapper
onARCClose
public void onARCClose()
- Specified by:
onARCClose in interface org.archive.mapred.ARCRecordMapper
map
public void map(org.apache.hadoop.io.WritableComparable key,
org.apache.hadoop.io.Writable value,
org.apache.hadoop.mapred.OutputCollector output,
org.apache.hadoop.mapred.Reporter r)
throws java.io.IOException
- Specified by:
map in interface org.apache.hadoop.mapred.Mapper
- Throws:
java.io.IOException
checkCollectionName
protected boolean checkCollectionName()
getCollectionFromArcname
protected static java.lang.String getCollectionFromArcname(java.lang.String arcurl)
throws java.net.URISyntaxException
- Throws:
java.net.URISyntaxException
isIndex
protected boolean isIndex(org.archive.io.arc.ARCRecord rec)
- Parameters:
rec - ARC Record to test.
- Returns:
- True if we are to index this record.
getARCName
protected java.lang.String getARCName(org.archive.io.arc.ARCRecordMetaData firstARCRecordMeta)
- Parameters:
firstARCRecordMeta - The metadata record of the first record in an
ARC (the filedesc record).
- Returns:
- Trimmed ARC name, stripped of its path preamble/prefix and its suffix
(at least WERA expects an ARC name without scheme and suffix, e.g.
IAH-20060315203614-00000-debord).
- Throws:
java.lang.NullPointerException - If unable to find an ARC name.
getStatus
protected java.lang.String getStatus(java.lang.String url,
java.lang.String oldUrl,
java.lang.String recordLengthAsStr,
java.lang.String noSpacesMimetype)
formatToOneLine
protected java.lang.String formatToOneLine(java.lang.String s)
getParseRateLogMessage
protected java.lang.String getParseRateLogMessage(java.lang.String url,
java.lang.String mimetype,
double kbPerSecond)
getParseRate
protected double getParseRate(long startTime,
long len)
skip
protected boolean skip(java.lang.String mimetype)
getMimetype
protected java.lang.String getMimetype(java.lang.String mimetype,
org.apache.nutch.util.mime.MimeTypes mts,
java.lang.String url)
checkMimetype
protected static java.lang.String checkMimetype(java.lang.String mimetype)
close
public void close()
- Specified by:
close in interface org.apache.hadoop.io.Closeable
doImportUsage
public static void doImportUsage(java.lang.String message,
int exitCode)
main
public static void main(java.lang.String[] args)
throws java.lang.Exception
- Throws:
java.lang.Exception
run
public int run(java.lang.String[] args)
throws java.lang.Exception
- Specified by:
run in interface org.apache.hadoop.util.Tool
- Throws:
java.lang.Exception
Copyright © 2005-2007 Internet Archive. All Rights Reserved.