package eu.dnetlib.data.mapreduce.hbase.dedup.experiment;

import java.io.IOException;
import java.io.StringReader;
import java.util.List;
import java.util.Locale;
import java.util.Set;

import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import eu.dnetlib.data.mapreduce.JobParams;
import eu.dnetlib.data.mapreduce.util.DedupUtils;
import eu.dnetlib.data.mapreduce.util.OafDecoder;
import eu.dnetlib.data.proto.PersonProtos;
import eu.dnetlib.data.proto.TypeProtos.Type;
import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.Person;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.math.RandomUtils;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.dom4j.Document;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;

/**
 * Mapper that reads one XML record per input value and emits one output record per
 * (author, key) pair.
 *
 * <p>For every record the mapper collects the subjects found by {@link SubjectParser} as
 * {@code "subject.*"} features, reads the {@code creator} elements as {@link Person}s, and for each
 * person derives a set of keys (see {@link #getOutKeys(Person)}). For each key a {@link CsvEntry} is
 * built holding the subject features plus {@code "coauthor.*"} features, and is written as
 * {@code (key, entry.toString())}.
 *
 * <p>Keys of three characters or fewer, and entries with fewer than {@link #MIN_FEATURES} features,
 * are counted and skipped; the remaining keys and authors of the same record are still processed.
 */
public class JoinPersonGroupMapper extends Mapper<Text, Text, Text, Text> {

	private static final String SUBJECT_PREFIX = "subject.";
	private static final String COAUTHOR_PREFIX = "coauthor.";

	/** Counter group used for every counter incremented by this mapper. */
	public static final String PERSON = "person";

	/** Maximum number of name tokens considered when building keys for a non-accurate person. */
	private static final int MAX_TOKENS = 5;

	/** Entries with fewer features than this are not emitted. */
	private static final int MIN_FEATURES = 10;

	private Text outKey;
	private Text outValue;

	private SubjectParser sp;

	/** Reused across records; a mapper instance processes its records sequentially. */
	private SAXReader reader;

	@Override
	protected void setup(final Context context) throws IOException, InterruptedException {
		outKey = new Text();
		outValue = new Text();

		sp = new SubjectParser();
		reader = new SAXReader();
	}

	/**
	 * Parses {@code value} as XML and writes one record per author key.
	 *
	 * <p>Any failure while handling the record is reported on stdout/stderr and counted under the
	 * {@link #PERSON} counter group using the exception class as counter name; it is deliberately not
	 * propagated, so a single bad record does not fail the task.
	 *
	 * @param key     input key (not used)
	 * @param value   XML text of the record
	 * @param context Hadoop context used for counters and output
	 */
	@Override
	protected void map(final Text key, final Text value, final Context context) throws IOException, InterruptedException {
		try {
			final Document doc = reader.read(new StringReader(value.toString()));
			final CsvEntry subjectEntry = subjectFeatures(doc);

			final List<Person> authors = getAuthors(doc);
			final String title = getTitle(doc);
			final String pubId = getId(doc);

			// Both parts of the publication id are the same for every author and key of this record.
			final String prefix = StringUtils.substringBefore(pubId, "::");
			final String originalId = StringUtils.substringAfter(pubId, "::");

			for (final Person p1 : authors) {

				context.getCounter(PERSON, "accurate " + p1.isAccurate()).increment(1);
				final Set<String> hashes = getOutKeys(p1);
				context.getCounter(PERSON, String.format("accurate %s keys", p1.isAccurate())).increment(hashes.size());

				// The co-author features depend only on p1, not on the key being emitted.
				final List<String> coauthors = coauthorFeatures(p1, authors);

				for (final String s1 : hashes) {
					// NOTE(review): assumes the CsvEntry constructor copies the given features - confirm;
					// if it shares the collection, co-author features would leak between entries.
					final CsvEntry c = new CsvEntry(s1, subjectEntry.getFeatures());
					for (final String feature : coauthors) {
						c.addFeature(feature);
					}

					c.setId(getId(prefix, originalId, p1.getOriginal()));
					c.setOriginalName(p1.getOriginal());
					c.setTitle(title);

					// Drop the key itself from the features, should it appear there.
					c.getFeatures().remove(s1);

					// Skip only this key: the other keys and authors of the record must still be emitted.
					if (s1.length() <= 3) {
						context.getCounter(PERSON, "key size <= 3").increment(1);
						continue;
					}

					if (c.getFeatures().size() < MIN_FEATURES) {
						context.getCounter(PERSON, "features < " + MIN_FEATURES).increment(1);
						continue;
					}

					outKey.set(s1);
					outValue.set(c.toString());

					context.write(outKey, outValue);
				}
			}

		} catch (final Throwable e) {
			// Best effort by design: report, count, and move on to the next record.
			System.out.println("GOT EX " + e);
			e.printStackTrace(System.err);
			context.getCounter(PERSON, e.getClass().toString()).increment(1);
		}
	}

	/**
	 * Builds the person identifier. The id depends on both the publication id and the person name,
	 * so the same name appearing in two publications yields two different ids.
	 *
	 * @param nsPrefix   part of the publication id before {@code "::"}
	 * @param originalId part of the publication id after {@code "::"}
	 * @param name       person name; runs of whitespace are collapsed to one space and the result is trimmed
	 * @return the value of {@code AbstractDNetXsltFunctions.oafId} for type person, {@code nsPrefix}
	 *         and {@code originalId + "::" + normalizedName}
	 */
	protected String getId(final String nsPrefix, final String originalId, final String name) {
		final String localId = name.replaceAll("\\s+", " ").trim();
		return AbstractDNetXsltFunctions.oafId(Type.person.toString(), nsPrefix, originalId + "::" + localId);
	}

	/** Collects every subject of the document as a quoted {@code "subject.<cleaned>"} feature. */
	private CsvEntry subjectFeatures(final Document doc) {
		final SubjectsMap sm = sp.parse(doc);
		final CsvEntry entry = new CsvEntry();
		for (final Subjects subs : sm.values()) {
			for (final String subject : subs) {
				entry.addFeature("\"" + SUBJECT_PREFIX + cleanup(subject) + "\"");
			}
		}
		return entry;
	}

	/**
	 * Builds the quoted {@code "coauthor.<surname>"} features for {@code p1}.
	 *
	 * <p>Features are produced only when {@code p1} is accurate, and only for accurate authors whose
	 * surname differs (ignoring case) from the surname of {@code p1}.
	 */
	private List<String> coauthorFeatures(final Person p1, final List<Person> authors) {
		final List<String> features = Lists.newArrayList();
		if (!p1.isAccurate()) {
			return features;
		}
		for (final Person p2 : authors) {
			if (p2.isAccurate() && !p1.getSurnameString().equalsIgnoreCase(p2.getSurnameString())) {
				// normalize() leaves only lowercase ASCII letters and spaces, so no quote can survive it.
				final String surname = normalize(p2.getSurnameString()).replaceAll("\\s+", "_");
				features.add("\"" + COAUTHOR_PREFIX + surname + "\"");
			}
		}
		return features;
	}

	/** Replaces spaces and dots with underscores and removes double quotes. */
	private String cleanup(final String s) {
		return s.replace(' ', '_').replace('.', '_').replace("\"", "");
	}

	/** Returns the text of the first {@code objIdentifier} element, whatever its namespace. */
	private String getId(final Document doc) {
		return doc.valueOf("//*[local-name() = 'objIdentifier']/text()");
	}

	/** Creates one {@link Person} per {@code creator} element, in the order returned by the XPath query. */
	private List<Person> getAuthors(final Document doc) {
		final List<?> creatorNodes = doc.selectNodes("//*[local-name() = 'creator']");
		final List<Person> authors = Lists.newArrayList();

		for (final Object node : creatorNodes) {
			authors.add(new Person(((Element) node).getText(), false));
		}
		return authors;
	}

	/** Returns the text of the first {@code title} element with commas removed, or {@code ""} if there is none. */
	private String getTitle(final Document doc) {
		final List<?> titleNodes = doc.selectNodes("//*[local-name() = 'title']");
		if (titleNodes == null || titleNodes.isEmpty()) {
			return "";
		}
		return ((Element) titleNodes.get(0)).getText().replaceAll(",", "");
	}

	/**
	 * Derives the output keys for a person.
	 *
	 * <p>For an accurate person: one key per given name, made of the normalized surname followed by
	 * the name's first letter. Otherwise: for every ordered pair of distinct tokens among the first
	 * {@link #MAX_TOKENS} tokens of the normalized original string, the first letter of one token
	 * followed by the other token.
	 */
	private Set<String> getOutKeys(final Person p1) {
		final Set<String> hashes = Sets.newHashSet();
		if (p1.isAccurate()) {
			for (final String name : p1.getName()) {
				hashes.add(normalize(p1.getSurnameString() + firstLC(name)));
			}
		} else {
			// Split once instead of re-splitting the string on every outer iteration.
			final List<String> nameTokens = Lists.newArrayList(tokens(normalize(p1.getOriginal())));
			for (final String token1 : nameTokens) {
				for (final String token2 : nameTokens) {
					if (!token1.equals(token2)) {
						hashes.add(firstLC(token1) + token2);
					}
				}
			}
		}
		return hashes;
	}

	/** Normalizes the surname of {@code p} followed by the first letter of its name string. */
	private String normalize(final Person p) {
		return normalize(p.getSurnameString() + firstLC(p.getNameString()));
	}

	/**
	 * Removes every character that is not an ASCII letter or a space, lowercases and trims.
	 * {@link Locale#ROOT} keeps the result independent of the JVM default locale.
	 */
	private String normalize(final String s) {
		return s.replaceAll("[^a-zA-Z ]", "").toLowerCase(Locale.ROOT).trim();
	}

	/** Splits on single spaces, trimming tokens and dropping empty ones; at most {@link #MAX_TOKENS} are returned. */
	private Iterable<String> tokens(final String s) {
		return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), MAX_TOKENS);
	}

	/** Returns the first character of {@code s}, lowercased independently of the default locale. */
	private String firstLC(final String s) {
		return StringUtils.substring(s, 0, 1).toLowerCase(Locale.ROOT);
	}

}
