/*
 * Decompiled with CFR 0.152.
 */
package eu.dnetlib.data.mapreduce.hbase.dedup.experiment;

import com.google.common.base.Splitter;
import eu.dnetlib.data.mapreduce.hbase.dedup.experiment.Subjects;
import eu.dnetlib.data.mapreduce.hbase.dedup.experiment.SubjectsMap;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import org.dom4j.Document;
import org.dom4j.Element;

/**
 * Collects the text of every {@code subject} element of a dom4j document and
 * groups the normalised terms by subject type in a {@link SubjectsMap}.
 *
 * <p>A subject whose text starts with {@code info:eu-repo} is typed through
 * {@link #REGEX_SUBJECT}; every other subject is treated as a comma-separated
 * list of keywords stored under the type {@code "keyword"}.
 */
public class SubjectParser {
    /**
     * Regular expression for subjects of the form
     * {@code info:eu-repo/classification/<letters>/<rest>}. Capture group 3
     * (the letters-only segment) is used as the subject type and capture
     * group 4 (the remainder) as the subject value.
     */
    public static final String REGEX_SUBJECT = "^(info:eu-repo)\\/(classification)\\/([a-zA-Z]*)\\/(.*)$";

    /** Minimum length a normalised term must have in order to be stored. */
    private static final int MIN_LENGTH = 5;

    /** Type under which free-text (non {@code info:eu-repo}) subjects are stored. */
    private static final String KEYWORD_TYPE = "keyword";

    /** Prefix that marks a subject as a candidate for {@link #REGEX_SUBJECT}. */
    private static final String CLASSIFICATION_PREFIX = "info:eu-repo";

    /** Compiled once instead of on every {@code String.replaceAll/replaceFirst} call. */
    private static final Pattern SUBJECT_PATTERN = Pattern.compile(REGEX_SUBJECT);

    /** Matches every character that is neither an ASCII letter nor a space. */
    private static final Pattern NON_LETTER_OR_SPACE = Pattern.compile("[^a-zA-Z ]");

    /** Splits keyword lists on commas, trimming tokens and dropping empty ones. */
    private static final Splitter KEYWORD_SPLITTER = Splitter.on(",").trimResults().omitEmptyStrings();

    /**
     * Extracts the subjects of the given document, grouped by type.
     *
     * <p>An entry is registered in the result for every type returned by
     * {@link #guessType(String)}, including a blank one, even if no term of
     * that type is long enough to be stored. Terms are stripped of everything
     * but ASCII letters and spaces, lower-cased, and kept only when at least
     * {@link #MIN_LENGTH} characters remain.
     *
     * @param doc the document to scan; must not be {@code null}
     * @return a newly created map from subject type to the terms found for it
     */
    public SubjectsMap parse(Document doc) {
        List<?> subjectNodes = doc.selectNodes("//*[local-name() = 'subject']");
        SubjectsMap subjectMap = new SubjectsMap();
        for (Object node : subjectNodes) {
            String subject = ((Element) node).getText();
            String type = this.guessType(subject);
            // The bucket is registered before the blank check on purpose: this
            // keeps the set of keys in the result identical to what callers
            // have always received.
            if (!subjectMap.containsKey(type)) {
                subjectMap.put(type, new Subjects());
            }
            if (StringUtils.isBlank(type)) {
                continue;
            }
            Subjects bucket = (Subjects) subjectMap.get(type);
            if (KEYWORD_TYPE.equals(type)) {
                for (String token : KEYWORD_SPLITTER.split(subject)) {
                    addIfLongEnough(bucket, normalize(token));
                }
            } else {
                // Keeps only capture group 4; a subject that does not match the
                // pattern is left unchanged by replaceFirst.
                String token = SUBJECT_PATTERN.matcher(subject).replaceFirst("$4");
                if (StringUtils.isNotBlank(token)) {
                    addIfLongEnough(bucket, normalize(token));
                }
            }
        }
        return subjectMap;
    }

    /**
     * Strips every character that is not an ASCII letter or a space and
     * lower-cases the remainder.
     *
     * <p>{@link Locale#ROOT} is used so that the result does not depend on the
     * JVM default locale (e.g. {@code 'I'} must become {@code 'i'}, not the
     * dotless variant produced under a Turkish locale).
     */
    private static String normalize(String raw) {
        return NON_LETTER_OR_SPACE.matcher(raw).replaceAll("").toLowerCase(Locale.ROOT);
    }

    /** Adds {@code value} to {@code bucket} unless it is shorter than {@link #MIN_LENGTH}. */
    private static void addIfLongEnough(Subjects bucket, String value) {
        if (value.length() >= MIN_LENGTH) {
            bucket.add(value);
        }
    }

    /**
     * Determines the type of a subject string.
     *
     * @param subject the raw subject text
     * @return capture group 3 of {@link #REGEX_SUBJECT} when the subject starts
     *         with {@code info:eu-repo}, otherwise {@code "keyword"}
     */
    private String guessType(String subject) {
        if (subject.startsWith(CLASSIFICATION_PREFIX)) {
            // NOTE(review): when the subject has the prefix but does not match
            // the full pattern, replaceAll leaves it untouched, so the whole
            // subject string becomes the type. Behaviour kept as is; confirm
            // whether that is intended.
            return SUBJECT_PATTERN.matcher(subject).replaceAll("$3");
        }
        return KEYWORD_TYPE;
    }
}

