package de.lmu.ifi.dbs.elki.datasource.parser;

import de.lmu.ifi.dbs.elki.data.LabelList;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.datasource.bundle.BundleStreamSource;
import de.lmu.ifi.dbs.elki.datasource.parser.NumberVectorLabelParser;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.io.ParseUtil;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Parser that reads rows like {@link NumberVectorLabelParser}, but maps every
 * token that cannot be parsed as a double to an integer id instead of failing.
 * Each distinct string receives its own id, so that purely categorical input
 * still yields numerical vectors.
 *
 * <p>
 * A token that consists of a single {@code ?} is stored as {@link Double#NaN}.
 * The string-to-id table is cleared once the end of the stream is reported.
 *
 * @param <V> vector type produced by this parser
 */
@Description("This parser expects data in roughly the same format as the NumberVectorLabelParser,\nexcept that it will enumerate all unique strings to always produce numerical values.\nThis way, it can for example handle files that contain lines like 'y,n,y,y,n,y,n'.")
public class CategorialDataAsNumberVectorParser<V extends NumberVector> extends NumberVectorLabelParser<V> {
    /** Class logger. */
    private static final Logging LOG = Logging.getLogger(CategorialDataAsNumberVectorParser.class);

    /** Maps each distinct non-numeric token seen so far to its assigned id. */
    Object2IntOpenHashMap<String> unique;

    /**
     * First id handed out. Always at least 1 and strictly greater than the
     * map's default return value, so an assigned id can never be mistaken for
     * the "key not present" result of {@code getInt}.
     */
    int ustart;

    /** Reusable matcher that recognizes the {@code ?} token. */
    Matcher nanpattern;

    /**
     * Parameterization class.
     *
     * @param <V> vector type produced by the parser
     */
    public static class Parameterizer<V extends NumberVector> extends NumberVectorLabelParser.Parameterizer<V> {
        /**
         * Builds the parser from the format, label indices and vector factory
         * held by this parameterizer.
         *
         * @return new parser instance
         */
        @Override
        public CategorialDataAsNumberVectorParser<V> makeInstance() {
            return new CategorialDataAsNumberVectorParser<>(this.format, this.labelIndices, this.factory);
        }
    }

    /**
     * Constructor using {@link CSVReaderFormat#DEFAULT_FORMAT} and no label
     * indices.
     *
     * @param factory vector factory
     */
    public CategorialDataAsNumberVectorParser(NumberVector.Factory<V> factory) {
        this(CSVReaderFormat.DEFAULT_FORMAT, null, factory);
    }

    /**
     * Constructor.
     *
     * @param format reader format, passed to the superclass
     * @param labelIndices label column selection, passed to the superclass
     *        (may be {@code null}, as the one-argument constructor does)
     * @param factory vector factory, passed to the superclass
     */
    public CategorialDataAsNumberVectorParser(CSVReaderFormat format, long[] labelIndices, NumberVector.Factory<V> factory) {
        super(format, labelIndices, factory);
        this.unique = new Object2IntOpenHashMap<>();
        this.ustart = Math.max(this.unique.defaultReturnValue() + 1, 1);
        this.nanpattern = Pattern.compile("\\?").matcher("Dummy text");
    }

    /**
     * Delegates to the superclass and discards the string-to-id table when the
     * end of the stream is reached.
     *
     * @return the event reported by the superclass
     */
    @Override
    public BundleStreamSource.Event nextEvent() {
        final BundleStreamSource.Event event = super.nextEvent();
        if (event == BundleStreamSource.Event.END_OF_STREAM) {
            this.unique.clear();
        }
        return event;
    }

    /**
     * Consumes the tokens of the current line. Label columns are collected as
     * labels; all other columns become attributes: parsed doubles where
     * possible, {@link Double#NaN} for {@code ?}, and the enumerated id of the
     * token otherwise. Afterwards the current vector and label list are set
     * and the per-line buffers are cleared.
     *
     * @return always {@code true}
     */
    @Override
    protected boolean parseLineInternal() {
        for (int column = 0; this.tokenizer.valid(); this.tokenizer.advance(), column++) {
            if (isLabelColumn(column)) {
                this.haslabels = true;
                this.labels.add(this.tokenizer.getSubstring());
                continue;
            }
            try {
                this.attributes.add(this.tokenizer.getDouble());
            } catch (NumberFormatException e) {
                // Not a number: either the NaN marker or a categorical value.
                final String token = this.tokenizer.getSubstring();
                if (this.nanpattern.reset(token).matches()) {
                    this.attributes.add(Double.NaN);
                } else {
                    // Warn once if the token only failed because of an overflow.
                    if (!this.warnedPrecision && (e == ParseUtil.PRECISION_OVERFLOW || e == ParseUtil.EXPONENT_OVERFLOW)) {
                        getLogger().warning("Too many digits in what looked like a double number - treating as string: " + this.tokenizer.getSubstring());
                        this.warnedPrecision = true;
                    }
                    this.attributes.add(idOf(token));
                }
            }
        }
        // Hand the results over via the fields of the superclass.
        this.curvec = createVector();
        this.curlbl = LabelList.make(this.labels);
        this.attributes.clear();
        this.labels.clear();
        return true;
    }

    /**
     * Returns the id assigned to a token, assigning the next free id if the
     * token has not been seen before.
     *
     * @param token non-numeric token
     * @return id of the token
     */
    private int idOf(String token) {
        int id = this.unique.getInt(token);
        if (id == this.unique.defaultReturnValue()) {
            // Ids are consecutive, starting at ustart.
            id = this.ustart + this.unique.size();
            this.unique.put(token, id);
        }
        return id;
    }

    /**
     * @return the logger of this class
     */
    @Override
    protected Logging getLogger() {
        return LOG;
    }
}
