@Title(value="ARFF File Format Parser")
public class ArffParser extends java.lang.Object implements Parser
This parser is quite hackish and contains a lot of magic that is not yet configurable.
TODO: Allow configuration of the vector types (double, float)
TODO: when encountering integer columns, produce integer vectors.
TODO: allow optional class labels.
| Modifier and Type | Class and Description |
|---|---|
| `static class` | `ArffParser.Parameterizer` Parameterization class. |
| Modifier and Type | Field and Description |
|---|---|
| `static java.util.regex.Matcher` | `ARFF_COMMENT` Comment pattern. |
| `static java.util.regex.Matcher` | `ARFF_HEADER_ATTRIBUTE` Arff attribute declaration marker. |
| `static java.util.regex.Matcher` | `ARFF_HEADER_DATA` Arff data marker. |
| `static java.util.regex.Matcher` | `ARFF_HEADER_RELATION` Arff file marker. |
| `static java.util.regex.Matcher` | `ARFF_NUMERIC` Pattern for numeric columns. |
| `static java.lang.String` | `DEFAULT_ARFF_MAGIC_CLASS` Pattern to auto-convert columns to class labels. |
| `static java.lang.String` | `DEFAULT_ARFF_MAGIC_EID` Pattern to auto-convert columns to external ids. |
| (package private) `NumberVector.Factory<?>` | `denseFactory` Factory for dense vectors. |
| `static java.util.regex.Matcher` | `EMPTY` Empty line pattern. |
| (package private) `java.util.ArrayList<java.lang.String>` | `labels` (Reused) buffer for building label lists. |
| `private static Logging` | `LOG` Logger. |
| (package private) `java.util.regex.Matcher` | `magic_class` Pattern to recognize class label columns. |
| (package private) `java.util.regex.Matcher` | `magic_eid` Pattern to recognize external ids. |
| Constructor and Description |
|---|
| `ArffParser(java.util.regex.Pattern magic_eid, java.util.regex.Pattern magic_class)` Constructor. |
| `ArffParser(java.lang.String magic_eid, java.lang.String magic_class)` Constructor. |
| Modifier and Type | Method and Description |
|---|---|
| `void` | `cleanup()` Perform cleanup operations after parsing. |
| `private java.lang.Object[]` | `loadDenseInstance(java.io.StreamTokenizer tokenizer, int[] dimsize, TypeInformation[] etyp, int outdim)` |
| `private java.lang.Object[]` | `loadSparseInstance(java.io.StreamTokenizer tokenizer, int[] targ, int[] dimsize, TypeInformation[] elkitypes, int metaLength)` |
| `private java.io.StreamTokenizer` | `makeArffTokenizer(java.io.BufferedReader br)` Make a StreamTokenizer for the ARFF format. |
| `private void` | `nextToken(java.io.StreamTokenizer tokenizer)` Helper function for token handling. |
| `MultipleObjectsBundle` | `parse(java.io.InputStream instream)` Returns a list of the objects parsed from the specified input stream. |
| `private void` | `parseAttributeStatements(java.io.BufferedReader br, java.util.ArrayList<java.lang.String> names, java.util.ArrayList<java.lang.String> types)` Parse the "@attribute" section of the ARFF file. |
| `private void` | `processColumnTypes(java.util.ArrayList<java.lang.String> names, java.util.ArrayList<java.lang.String> types, int[] targ, TypeInformation[] etyp, int[] dims)` Process the column types (and names!) |
| `private void` | `readHeader(java.io.BufferedReader br)` Read the dataset header part of the ARFF file, to ensure consistency. |
| `private void` | `setupBundleHeaders(java.util.ArrayList<java.lang.String> names, int[] targ, TypeInformation[] etyp, int[] dimsize, MultipleObjectsBundle bundle, boolean sparse)` Setup the headers for the object bundle. |
private static final Logging LOG
public static final java.util.regex.Matcher ARFF_HEADER_RELATION
public static final java.util.regex.Matcher ARFF_HEADER_ATTRIBUTE
public static final java.util.regex.Matcher ARFF_HEADER_DATA
public static final java.util.regex.Matcher ARFF_COMMENT
public static final java.lang.String DEFAULT_ARFF_MAGIC_EID
public static final java.lang.String DEFAULT_ARFF_MAGIC_CLASS
public static final java.util.regex.Matcher ARFF_NUMERIC
public static final java.util.regex.Matcher EMPTY
java.util.regex.Matcher magic_eid
java.util.regex.Matcher magic_class
java.util.ArrayList<java.lang.String> labels
NumberVector.Factory<?> denseFactory
public ArffParser(java.util.regex.Pattern magic_eid,
java.util.regex.Pattern magic_class)
magic_eid - Magic to recognize external IDs
magic_class - Magic to recognize class labels
public ArffParser(java.lang.String magic_eid,
java.lang.String magic_class)
magic_eid - Magic to recognize external IDs
magic_class - Magic to recognize class labels
public MultipleObjectsBundle parse(java.io.InputStream instream)
Specified by: parse in interface Parser
private java.lang.Object[] loadSparseInstance(java.io.StreamTokenizer tokenizer,
int[] targ,
int[] dimsize,
TypeInformation[] elkitypes,
int metaLength)
throws java.io.IOException
java.io.IOException
private java.lang.Object[] loadDenseInstance(java.io.StreamTokenizer tokenizer,
int[] dimsize,
TypeInformation[] etyp,
int outdim)
throws java.io.IOException
java.io.IOException
private java.io.StreamTokenizer makeArffTokenizer(java.io.BufferedReader br)
br - Buffered reader
private void setupBundleHeaders(java.util.ArrayList<java.lang.String> names,
int[] targ,
TypeInformation[] etyp,
int[] dimsize,
MultipleObjectsBundle bundle,
boolean sparse)
names - Attribute names
targ - Target columns
etyp - ELKI type information
dimsize - Number of dimensions in the individual types
bundle - Output bundle
sparse - Flag to create sparse vectors
private void readHeader(java.io.BufferedReader br)
throws java.io.IOException
br - Buffered Reader
java.io.IOException
private void parseAttributeStatements(java.io.BufferedReader br,
java.util.ArrayList<java.lang.String> names,
java.util.ArrayList<java.lang.String> types)
throws java.io.IOException
br - Input
names - List (to fill) of attribute names
types - List (to fill) of attribute types
java.io.IOException
private void processColumnTypes(java.util.ArrayList<java.lang.String> names,
java.util.ArrayList<java.lang.String> types,
int[] targ,
TypeInformation[] etyp,
int[] dims)
names - Attribute names
types - Attribute types
targ - Target dimension mapping (ARFF to ELKI), return value
etyp - ELKI type information, return value
dims - Number of successive dimensions, return value
private void nextToken(java.io.StreamTokenizer tokenizer)
throws java.io.IOException
tokenizer - Tokenizer
java.io.IOException
Copyright © 2019 ELKI Development Team. License information.