VALUEIN - Currently only ForestDocument is supported,
but other types such as Text or BytesWritable are possible candidates
to be added.

public class ForestReader<VALUEIN> extends org.apache.hadoop.mapreduce.RecordReader<DocumentURIWithSourceInfo,VALUEIN> implements MarkLogicConstants
| Modifier and Type | Field and Description |
|---|---|
protected long |
bytesRead |
protected Collection<String> |
colFilters |
protected
org.apache.hadoop.conf.Configuration |
conf |
protected BiendianDataInputStream |
dataIs |
protected int |
deletedCnt |
protected Collection<String> |
dirFilters |
protected boolean |
done |
protected int |
fragCnt |
protected DocumentURIWithSourceInfo |
key |
protected
org.apache.hadoop.fs.Path |
largeForestDir |
static
org.apache.commons.logging.Log |
LOG |
protected int |
nascentCnt |
protected BiendianDataInputStream |
ordIs |
protected int |
position |
protected int |
prevDocid |
protected BiendianDataInputStream |
qualIs |
protected
org.apache.hadoop.mapreduce.lib.input.FileSplit |
split |
protected String |
srcId |
protected BiendianDataInputStream |
tsIs |
protected Collection<String> |
typeFilters |
protected VALUEIN |
value |
protected Class<? extends
org.apache.hadoop.io.Writable> |
valueClass |
ADVANCED_MODE,
ASSIGNMENT_POLICY,
BASIC_MODE,
BATCH_SIZE,
BIND_SPLIT_RANGE,
COLLECTION_FILTER,
CONTENT_TYPE,
COPY_COLLECTIONS,
COPY_METADATA,
COPY_QUALITY,
DEFAULT_BATCH_SIZE,
DEFAULT_CONTENT_TYPE,
DEFAULT_LOCAL_MAX_SPLIT_SIZE,
DEFAULT_MAX_SPLIT_SIZE,
DEFAULT_OUTPUT_CONTENT_ENCODING,
DEFAULT_OUTPUT_XML_REPAIR_LEVEL,
DEFAULT_PROPERTY_OPERATION_TYPE,
DEFAULT_TXN_SIZE,
DIRECTORY_FILTER,
DOCUMENT_SELECTOR,
EXECUTION_MODE,
EXTRACT_URI,
INDENTED,
INPUT_DATABASE_NAME,
INPUT_HOST,
INPUT_KEY_CLASS,
INPUT_LEXICON_FUNCTION_CLASS,
INPUT_MODE,
INPUT_PASSWORD,
INPUT_PORT,
INPUT_QUERY,
INPUT_QUERY_LANGUAGE,
INPUT_QUERY_TIMESTAMP,
INPUT_RESTRICT_HOSTS,
INPUT_SSL_OPTIONS_CLASS,
INPUT_SSL_PROTOCOL,
INPUT_USE_SSL,
INPUT_USERNAME,
INPUT_VALUE_CLASS,
MAX_SPLIT_SIZE,
MIN_NODEUPDATE_VERSION,
MODE_DISTRIBUTED,
MODE_LOCAL,
MR_NAMESPACE,
NODE_OPERATION_TYPE,
OUTPUT_CLEAN_DIR,
OUTPUT_COLLECTION,
OUTPUT_CONTENT_ENCODING,
OUTPUT_CONTENT_LANGUAGE,
OUTPUT_CONTENT_NAMESPACE,
OUTPUT_DATABASE_NAME,
OUTPUT_DIRECTORY,
OUTPUT_FAST_LOAD,
OUTPUT_FOREST_HOST,
OUTPUT_GRAPH,
OUTPUT_HOST,
OUTPUT_KEY_TYPE,
OUTPUT_KEY_VARNAME,
OUTPUT_NAMESPACE,
OUTPUT_OVERRIDE_GRAPH,
OUTPUT_PARTITION,
OUTPUT_PASSWORD,
OUTPUT_PERMISSION,
OUTPUT_PORT,
OUTPUT_PROPERTY_ALWAYS_CREATE,
OUTPUT_QUALITY,
OUTPUT_QUERY,
OUTPUT_QUERY_LANGUAGE,
OUTPUT_RESTRICT_HOSTS,
OUTPUT_SSL_OPTIONS_CLASS,
OUTPUT_SSL_PROTOCOL,
OUTPUT_STREAMING,
OUTPUT_URI_PREFIX,
OUTPUT_URI_REPLACE,
OUTPUT_URI_SUFFIX,
OUTPUT_USE_SSL,
OUTPUT_USERNAME,
OUTPUT_VALUE_TYPE,
OUTPUT_VALUE_VARNAME,
OUTPUT_XML_REPAIR_LEVEL,
PATH_NAMESPACE,
PROPERTY_OPERATION_TYPE,
QUERY_FILTER,
RECORD_TO_FRAGMENT_RATIO,
REDACTION_RULE_COLLECTION,
SPLIT_END_VARNAME,
SPLIT_QUERY,
SPLIT_START_VARNAME,
SUBDOCUMENT_EXPRESSION,
TEMPORAL_COLLECTION,
TXN_SIZE,
TYPE_FILTER

| Constructor and Description |
|---|
ForestReader() |
| Modifier and Type | Method and Description |
|---|---|
protected boolean |
applyFilter(String uri,
ExpandedTree tree) |
void |
close() |
DocumentURIWithSourceInfo |
getCurrentKey() |
VALUEIN |
getCurrentValue() |
float |
getProgress() |
void |
initialize(org.apache.hadoop.mapreduce.InputSplit split,
org.apache.hadoop.mapreduce.TaskAttemptContext context) |
boolean |
nextKeyValue() |
protected void |
setKey(String uri,
String sub,
int line, int col)
Apply URI prefix and suffix configuration
options and set the result as DocumentURI key.
|
protected void |
setSkipKey(String sub,
int line, int col, String reason)
Set the result as DocumentURI key.
|
public static final org.apache.commons.logging.Log LOG
protected org.apache.hadoop.mapreduce.lib.input.FileSplit split
protected long bytesRead
protected org.apache.hadoop.conf.Configuration conf
protected BiendianDataInputStream dataIs
protected BiendianDataInputStream ordIs
protected BiendianDataInputStream tsIs
protected BiendianDataInputStream qualIs
protected DocumentURIWithSourceInfo key
protected VALUEIN value
protected Class<? extends org.apache.hadoop.io.Writable> valueClass
protected int position
protected int prevDocid
protected boolean done
protected org.apache.hadoop.fs.Path largeForestDir
protected int nascentCnt
protected int deletedCnt
protected int fragCnt
protected Collection<String> colFilters
protected Collection<String> dirFilters
protected Collection<String> typeFilters
protected String srcId
public void close()
           throws IOException

Specified by: close in interface Closeable
Specified by: close in interface AutoCloseable
Overrides: close in class org.apache.hadoop.mapreduce.RecordReader<DocumentURIWithSourceInfo,VALUEIN>
Throws: IOException

public DocumentURIWithSourceInfo getCurrentKey() throws IOException, InterruptedException
Overrides: getCurrentKey in class org.apache.hadoop.mapreduce.RecordReader<DocumentURIWithSourceInfo,VALUEIN>
Throws: IOException, InterruptedException

public VALUEIN getCurrentValue() throws IOException, InterruptedException

Overrides: getCurrentValue in class org.apache.hadoop.mapreduce.RecordReader<DocumentURIWithSourceInfo,VALUEIN>
Throws: IOException, InterruptedException
public float getProgress()
throws IOException,
InterruptedException
Overrides: getProgress in class org.apache.hadoop.mapreduce.RecordReader<DocumentURIWithSourceInfo,VALUEIN>
Throws: IOException, InterruptedException
public void initialize(org.apache.hadoop.mapreduce.InputSplit split,
org.apache.hadoop.mapreduce.TaskAttemptContext context)
throws IOException,
InterruptedException
Overrides: initialize in class org.apache.hadoop.mapreduce.RecordReader<DocumentURIWithSourceInfo,VALUEIN>
Throws: IOException, InterruptedException
public boolean nextKeyValue()
throws IOException,
InterruptedException
Overrides: nextKeyValue in class org.apache.hadoop.mapreduce.RecordReader<DocumentURIWithSourceInfo,VALUEIN>
Throws: IOException, InterruptedException

protected void setKey(String uri, String sub, int line, int col)

Parameters:
uri - Source string of document URI.
sub - Sub-entry of the source of the document origin.
line - Line number in the source if applicable; -1 otherwise.
col - Column number in the source if applicable; -1 otherwise.

protected void setSkipKey(String sub, int line, int col, String reason)

Parameters:
sub - Sub-entry of the source of the document origin.
line - Line number in the source if applicable; -1 otherwise.
col - Column number in the source if applicable; -1 otherwise.
reason - Reason for skipping.

protected boolean applyFilter(String uri, ExpandedTree tree)
Copyright © 2020 MarkLogic
Corporation. All Rights Reserved.
Complete online documentation for MarkLogic Server,
XQuery and related components may be found at
developer.marklogic.com