org.archive.access.nutch
Class ImportArcs

java.lang.Object
  extended by org.apache.hadoop.util.ToolBase
      extended by org.archive.access.nutch.ImportArcs
All Implemented Interfaces:
org.apache.hadoop.conf.Configurable, org.apache.hadoop.io.Closeable, org.apache.hadoop.mapred.JobConfigurable, org.apache.hadoop.mapred.Mapper, org.apache.hadoop.util.Tool, org.archive.mapred.ARCRecordMapper

public class ImportArcs
extends org.apache.hadoop.util.ToolBase
implements org.archive.mapred.ARCRecordMapper

Ingests ARCs, writing the parse of each ARC record in Nutch FetcherOutputFormat (FOF). FOF has five outputs (the list of outputs is not reproduced in this rendering of the page).


Nested Class Summary
static class ImportArcs.WaxFetcherOutputFormat
          Override of the Nutch FetcherOutputFormat that substitutes a custom ParseOutputFormat, ImportArcs.WaxParseOutputFormat.
static class ImportArcs.WaxParseOutputFormat
          Copy (presumably of the Nutch ParseOutputFormat) made so that a collection prefix can be added to the produced signature and link CrawlDatums.
 
Field Summary
static java.lang.String ARCCOLLECTION_KEY
           
static java.lang.String ARCFILENAME_KEY
           
static java.lang.String ARCFILEOFFSET_KEY
           
 org.apache.commons.logging.Log LOG
           
static java.lang.String WAX_COLLECTION_KEY
           
 
Fields inherited from class org.apache.hadoop.util.ToolBase
conf
 
Constructor Summary
ImportArcs()
           
ImportArcs(org.apache.hadoop.conf.Configuration conf)
           
 
Method Summary
protected  boolean checkCollectionName()
           
protected static java.lang.String checkMimetype(java.lang.String mimetype)
           
 void close()
           
 void configure(org.apache.hadoop.mapred.JobConf job)
           
static void doImportUsage(java.lang.String message, int exitCode)
           
protected  java.lang.String formatToOneLine(java.lang.String s)
           
protected  java.lang.String getARCName(org.archive.io.arc.ARCRecordMetaData firstARCRecordMeta)
           
protected static java.lang.String getCollectionFromArcname(java.lang.String arcurl)
           
 org.apache.hadoop.conf.Configuration getConf()
           
protected  java.lang.String getMimetype(java.lang.String mimetype, org.apache.nutch.util.mime.MimeTypes mts, java.lang.String url)
           
protected  double getParseRate(long startTime, long len)
           
protected  java.lang.String getParseRateLogMessage(java.lang.String url, java.lang.String mimetype, double kbPerSecond)
           
protected  java.lang.String getStatus(java.lang.String url, java.lang.String oldUrl, java.lang.String recordLengthAsStr, java.lang.String noSpacesMimetype)
           
 void importArcs(org.apache.hadoop.fs.Path arcUrlsDir, org.apache.hadoop.fs.Path segment, java.lang.String collection)
           
protected  boolean isIndex(org.archive.io.arc.ARCRecord rec)
           
static void main(java.lang.String[] args)
           
 void map(org.apache.hadoop.io.WritableComparable key, org.apache.hadoop.io.Writable value, org.apache.hadoop.mapred.OutputCollector output, org.apache.hadoop.mapred.Reporter r)
           
 void onARCClose()
           
 void onARCOpen()
           
 int run(java.lang.String[] args)
           
 void setConf(org.apache.hadoop.conf.Configuration c)
           
protected  boolean skip(java.lang.String mimetype)
           
 
Methods inherited from class org.apache.hadoop.util.ToolBase
doMain
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

LOG

public final org.apache.commons.logging.Log LOG

ARCFILENAME_KEY

public static final java.lang.String ARCFILENAME_KEY
See Also:
Constant Field Values

ARCFILEOFFSET_KEY

public static final java.lang.String ARCFILEOFFSET_KEY
See Also:
Constant Field Values

ARCCOLLECTION_KEY

public static final java.lang.String ARCCOLLECTION_KEY
See Also:
Constant Field Values

WAX_COLLECTION_KEY

public static final java.lang.String WAX_COLLECTION_KEY
See Also:
Constant Field Values
Constructor Detail

ImportArcs

public ImportArcs()

ImportArcs

public ImportArcs(org.apache.hadoop.conf.Configuration conf)
Method Detail

importArcs

public void importArcs(org.apache.hadoop.fs.Path arcUrlsDir,
                       org.apache.hadoop.fs.Path segment,
                       java.lang.String collection)
                throws java.io.IOException
Throws:
java.io.IOException

configure

public void configure(org.apache.hadoop.mapred.JobConf job)
Specified by:
configure in interface org.apache.hadoop.mapred.JobConfigurable

getConf

public org.apache.hadoop.conf.Configuration getConf()
Specified by:
getConf in interface org.apache.hadoop.conf.Configurable
Overrides:
getConf in class org.apache.hadoop.util.ToolBase

setConf

public void setConf(org.apache.hadoop.conf.Configuration c)
Specified by:
setConf in interface org.apache.hadoop.conf.Configurable
Overrides:
setConf in class org.apache.hadoop.util.ToolBase

onARCOpen

public void onARCOpen()
Specified by:
onARCOpen in interface org.archive.mapred.ARCRecordMapper

onARCClose

public void onARCClose()
Specified by:
onARCClose in interface org.archive.mapred.ARCRecordMapper

map

public void map(org.apache.hadoop.io.WritableComparable key,
                org.apache.hadoop.io.Writable value,
                org.apache.hadoop.mapred.OutputCollector output,
                org.apache.hadoop.mapred.Reporter r)
         throws java.io.IOException
Specified by:
map in interface org.apache.hadoop.mapred.Mapper
Throws:
java.io.IOException

checkCollectionName

protected boolean checkCollectionName()

getCollectionFromArcname

protected static java.lang.String getCollectionFromArcname(java.lang.String arcurl)
                                                    throws java.net.URISyntaxException
Throws:
java.net.URISyntaxException

isIndex

protected boolean isIndex(org.archive.io.arc.ARCRecord rec)
Parameters:
rec - ARC Record to test.
Returns:
True if we are to index this record.

getARCName

protected java.lang.String getARCName(org.archive.io.arc.ARCRecordMetaData firstARCRecordMeta)
Parameters:
firstARCRecordMeta - The metadata record of the first record in an ARC (the filedesc record).
Returns:
Trimmed ARC name, stripped of path preamble/prefix and suffix (at least WERA expects an ARC name without scheme and suffix, e.g. IAH-20060315203614-00000-debord).
Throws:
java.lang.NullPointerException - If unable to find an ARC name.

getStatus

protected java.lang.String getStatus(java.lang.String url,
                                     java.lang.String oldUrl,
                                     java.lang.String recordLengthAsStr,
                                     java.lang.String noSpacesMimetype)

formatToOneLine

protected java.lang.String formatToOneLine(java.lang.String s)

getParseRateLogMessage

protected java.lang.String getParseRateLogMessage(java.lang.String url,
                                                  java.lang.String mimetype,
                                                  double kbPerSecond)

getParseRate

protected double getParseRate(long startTime,
                              long len)

skip

protected boolean skip(java.lang.String mimetype)

getMimetype

protected java.lang.String getMimetype(java.lang.String mimetype,
                                       org.apache.nutch.util.mime.MimeTypes mts,
                                       java.lang.String url)

checkMimetype

protected static java.lang.String checkMimetype(java.lang.String mimetype)

close

public void close()
Specified by:
close in interface org.apache.hadoop.io.Closeable

doImportUsage

public static void doImportUsage(java.lang.String message,
                                 int exitCode)

main

public static void main(java.lang.String[] args)
                 throws java.lang.Exception
Throws:
java.lang.Exception

run

public int run(java.lang.String[] args)
        throws java.lang.Exception
Specified by:
run in interface org.apache.hadoop.util.Tool
Throws:
java.lang.Exception


Copyright © 2005-2007 Internet Archive. All Rights Reserved.