package eu.dnetlib.data.mapreduce.hbase.dedup.experiment;

import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.google.common.base.Splitter;
import org.apache.commons.lang.StringUtils;
import org.dom4j.Element;

/**
 * Created by claudio on 25/03/16.
 *
 * Collects the text of every element whose local name is {@code subject} in a document and groups
 * the normalized values by subject type in a {@link SubjectsMap}.
 *
 * Two kinds of subject are recognised:
 * <ul>
 * <li>classification URIs matching {@link #REGEX_SUBJECT}; the third path segment is used as the type
 * and the remainder as the value;</li>
 * <li>anything else, which is stored under the type {@code "keyword"} after being split on commas.</li>
 * </ul>
 *
 * Values are normalized by removing every character other than ASCII letters and spaces and then
 * lower-casing; values shorter than {@code MIN_LENGTH} characters are discarded.
 */
public class SubjectParser {

	public static final String REGEX_SUBJECT = "^(info:eu-repo)\\/(classification)\\/([a-zA-Z]*)\\/(.*)$";
	private static final int MIN_LENGTH = 5;

	private static final String KEYWORD_TYPE = "keyword";
	private static final String CLASSIFICATION_PREFIX = "info:eu-repo";

	// Compiled once: String.replaceAll/replaceFirst would recompile these for every subject and token.
	private static final Pattern SUBJECT_PATTERN = Pattern.compile(REGEX_SUBJECT);
	private static final Pattern NON_LETTERS = Pattern.compile("[^a-zA-Z ]");
	private static final Splitter KEYWORD_SPLITTER = Splitter.on(",").trimResults().omitEmptyStrings();

	/**
	 * Extracts and groups the subjects found in the given document.
	 *
	 * @param doc
	 *            the document to scan for {@code subject} elements (matched by local name, any namespace)
	 * @return a map from subject type to the normalized values found for that type. An entry is created
	 *         for every valid type encountered, even when none of its values passes the length filter.
	 *         Subjects that start with {@code info:eu-repo} but do not match {@link #REGEX_SUBJECT}, or
	 *         whose type segment is empty, are skipped.
	 */
	public SubjectsMap parse(final org.dom4j.Document doc) {

		final SubjectsMap subjectMap = new SubjectsMap();
		final List<?> subjectNodes = doc.selectNodes("//*[local-name() = 'subject']");

		for (final Object node : subjectNodes) {
			// The XPath above selects elements only, so the cast is safe.
			final String subject = ((Element) node).getText();
			if (subject == null) {
				continue;
			}

			// Surrounding whitespace would otherwise defeat the prefix check
			// or leak into the extracted type.
			final String candidate = subject.trim();

			if (!candidate.startsWith(CLASSIFICATION_PREFIX)) {
				final Subjects keywords = subjectsFor(subjectMap, KEYWORD_TYPE);
				for (final String token : KEYWORD_SPLITTER.split(subject)) {
					addNormalized(keywords, token);
				}
				continue;
			}

			final Matcher m = SUBJECT_PATTERN.matcher(candidate);
			if (!m.matches()) {
				// Not a well-formed classification URI: there is no type to file it under.
				continue;
			}

			final String type = m.group(3);
			if (StringUtils.isBlank(type)) {
				// e.g. "info:eu-repo/classification//x" - the type segment may legally be empty in the regex.
				continue;
			}

			addNormalized(subjectsFor(subjectMap, type), m.group(4));
		}

		return subjectMap;
	}

	/** Returns the {@link Subjects} registered for {@code type}, registering a new empty one if absent. */
	private static Subjects subjectsFor(final SubjectsMap subjectMap, final String type) {
		if (!subjectMap.containsKey(type)) {
			subjectMap.put(type, new Subjects());
		}
		return subjectMap.get(type);
	}

	/** Normalizes {@code raw} and adds it to {@code target} when it is at least {@code MIN_LENGTH} characters long. */
	private static void addNormalized(final Subjects target, final String raw) {
		// Locale.ROOT keeps the result independent of the JVM default locale (e.g. Turkish dotless i).
		final String value = NON_LETTERS.matcher(raw).replaceAll("").toLowerCase(Locale.ROOT);
		if (value.length() >= MIN_LENGTH) {
			target.add(value);
		}
	}
}
