package eu.dnetlib.data.mapreduce.hbase.dedup;

import java.io.IOException;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.PriorityQueue;
import java.util.Queue;
import java.util.Set;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;

import com.google.common.collect.Lists;

import eu.dnetlib.data.mapreduce.util.DedupUtils;
import eu.dnetlib.data.proto.RelTypeProtos.SubRelType;
import eu.dnetlib.data.proto.TypeProtos.Type;
import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.config.DynConf;
import eu.dnetlib.pace.distance.PaceDocumentDistance;
import eu.dnetlib.pace.model.FieldList;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.model.MapDocumentComparator;
import eu.dnetlib.pace.model.MapDocumentSerializer;
import eu.dnetlib.pace.util.DedupConfig;
import eu.dnetlib.pace.util.DedupConfigLoader;

/**
 * Reducer of the dedup workflow: for every clustering key it collects the documents emitted by the mapper, orders them by the
 * configured order field and compares each document with the ones following it, writing a pair of symmetric similarity relations
 * (as HBase {@link Put}s) whenever the computed distance reaches the configured threshold.
 */
public class DedupReducer extends TableReducer<Text, ImmutableBytesWritable, ImmutableBytesWritable> {

	/** Value passed to {@link Put#setWriteToWAL(boolean)} for every emitted relation. */
	private static final boolean WRITE_TO_WAL = false;

	/** Pace configuration, loaded in {@link #setup(Context)} from the "dedup.pace.conf" job property. */
	private Config paceConf;

	/** Dedup workflow configuration, loaded in {@link #setup(Context)} from the "dedup.wf.conf" job property. */
	private DedupConfig dedupConf;

	/** Reusable output key, re-pointed to the row key of each emitted relation. */
	private ImmutableBytesWritable ibw;

	/**
	 * Loads the pace and workflow configurations from the job configuration and allocates the reusable output key.
	 */
	@Override
	protected void setup(final Context context) throws IOException, InterruptedException {
		paceConf = DynConf.load(context.getConfiguration().get("dedup.pace.conf"));
		dedupConf = DedupConfigLoader.load(context.getConfiguration().get("dedup.wf.conf"));
		ibw = new ImmutableBytesWritable();

		System.out.println("dedup reduce phase \npace conf: " + paceConf.fields() + "\nwf conf: " + dedupConf.toString());
	}

	/**
	 * Processes all the documents sharing the given clustering key.
	 *
	 * @throws IllegalArgumentException
	 *             when the configured entity type is not one of person, result, organization
	 */
	@Override
	protected void reduce(final Text key, final Iterable<ImmutableBytesWritable> values, final Context context) throws IOException, InterruptedException {
		System.out.println("\nReducing key: '" + key + "'");

		final Queue<MapDocument> q = prepare(context, key, values);
		switch (Type.valueOf(dedupConf.getEntityType())) {
		case person:
		case organization:
			// The heap must be drained before processing: iterating a PriorityQueue does not visit the
			// elements in comparator order, which would make the sliding window span arbitrary documents.
			process(drainInOrder(q), context);
			break;
		case result:
			process(simplifyQueue(q, key.toString(), context), context);
			break;
		default:
			throw new IllegalArgumentException("dedup not implemented for type: " + dedupConf.getEntityType());
		}
	}

	/**
	 * Decodes the incoming values into a priority queue ordered by the configured order field, dropping documents whose
	 * identifier was already seen. Reading stops (and a counter is incremented) as soon as the queue grows beyond the
	 * configured maximum size.
	 */
	private Queue<MapDocument> prepare(final Context context, final Text key, final Iterable<ImmutableBytesWritable> values) {
		final Queue<MapDocument> queue = new PriorityQueue<MapDocument>(100, new MapDocumentComparator(dedupConf.getOrderField()));

		final Set<String> seen = new HashSet<String>();

		for (ImmutableBytesWritable i : values) {
			final MapDocument doc = MapDocumentSerializer.decode(i.copyBytes());

			// Set.add returns false for identifiers that were already present
			if (seen.add(doc.getIdentifier())) {
				queue.add(doc);
			}

			if (queue.size() > dedupConf.getQueueMaxSize()) {
				context.getCounter("ngram size > " + dedupConf.getQueueMaxSize(), "N").increment(1);
				System.out.println("breaking out after limit (" + dedupConf.getQueueMaxSize() + ") for ngram '" + key);
				break;
			}
		}

		return queue;
	}

	/**
	 * Empties the given queue through successive {@link Queue#remove()} calls, so that the returned queue iterates its
	 * elements in the order defined by the source queue (the comparator order, for a priority queue).
	 */
	private Queue<MapDocument> drainInOrder(final Queue<MapDocument> queue) {
		final Queue<MapDocument> ordered = new LinkedList<MapDocument>();
		while (!queue.isEmpty()) {
			ordered.add(queue.remove());
		}
		return ordered;
	}

	/**
	 * Drains the given queue in order, grouping consecutive documents that share the same cleaned-up order field value.
	 * Groups smaller than the configured group max size are kept; larger ones are discarded (see
	 * {@link #populateSimplifiedQueue}). Documents without a value for the order field are dropped and counted.
	 */
	private Queue<MapDocument> simplifyQueue(final Queue<MapDocument> queue, final String ngram, final Context context) {
		final Queue<MapDocument> q = new LinkedList<MapDocument>();

		String fieldRef = "";
		final List<MapDocument> tempResults = Lists.newArrayList();

		while (!queue.isEmpty()) {
			final MapDocument result = queue.remove();

			// null-safe, consistently with the lookup performed while processing the queue
			final FieldList orderValues = result.values(dedupConf.getOrderField());
			if (orderValues != null && !orderValues.isEmpty()) {
				final String field = NGramUtils.cleanupForOrdering(orderValues.stringValue());
				if (field.equals(fieldRef)) {
					tempResults.add(result);
				} else {
					// the value changed: flush the group collected so far and start a new one
					populateSimplifiedQueue(q, tempResults, context, fieldRef, ngram);
					tempResults.clear();
					tempResults.add(result);
					fieldRef = field;
				}
			} else {
				context.getCounter(dedupConf.getEntityType(), "missing " + dedupConf.getOrderField()).increment(1);
			}
		}
		// flush the last group
		populateSimplifiedQueue(q, tempResults, context, fieldRef, ngram);

		return q;
	}

	/**
	 * Appends the given group to the simplified queue when its size is below the configured group max size; otherwise the
	 * whole group is skipped, and the number of skipped records is added to a counter.
	 */
	private void populateSimplifiedQueue(final Queue<MapDocument> q,
			final List<MapDocument> tempResults,
			final Context context,
			final String fieldRef,
			final String ngram) {
		if (tempResults.size() < dedupConf.getGroupMaxSize()) {
			q.addAll(tempResults);
		} else {
			context.getCounter(dedupConf.getEntityType(), "Skipped records for count(" + dedupConf.getOrderField() + ") >= " + dedupConf.getGroupMaxSize())
			.increment(tempResults.size());
			System.out.println("Skipped field: " + fieldRef + " - size: " + tempResults.size() + " - ngram: " + ngram);
		}
	}

	/**
	 * Repeatedly removes the head of the queue (the pivot) and compares it with the documents that follow it in the queue
	 * iteration order, emitting a similarity relation for each pair whose distance reaches the configured threshold.
	 * Pivots and candidates without a value for the order field are not compared.
	 *
	 * @param queue
	 *            the documents to compare; callers are expected to pass a queue whose iteration order is the intended
	 *            comparison order
	 */
	private void process(final Queue<MapDocument> queue, final Context context) throws IOException, InterruptedException {

		final PaceDocumentDistance algo = new PaceDocumentDistance();

		while (!queue.isEmpty()) {

			final MapDocument pivot = queue.remove();
			final String idPivot = pivot.getIdentifier();

			if (orderFieldValue(pivot) == null) {
				continue;
			}

			// number of documents actually compared with the current pivot
			int i = 0;
			for (MapDocument curr : queue) {
				final String idCurr = curr.getIdentifier();

				if (mustSkip(idCurr)) {
					// NOTE(review): this ends the scan for the current pivot, it does not just skip this
					// document - confirm that this is the intended behaviour.
					context.getCounter(dedupConf.getEntityType(), "skip list").increment(1);
					break;
				}

				// NOTE(review): with '>' up to (sliding window size + 1) comparisons are performed per pivot
				if (i > dedupConf.getSlidingWindowSize()) {
					break;
				}

				if (!idCurr.equals(idPivot) && orderFieldValue(curr) != null) {

					final double d = algo.between(pivot, curr, paceConf);

					if (d >= dedupConf.getThreshold()) {
						writeSimilarity(context, idPivot, idCurr);
						context.getCounter(dedupConf.getEntityType(), SubRelType.dedupSimilarity.toString() + " (x2)").increment(1);
					} else {
						context.getCounter(dedupConf.getEntityType(), "d < " + dedupConf.getThreshold()).increment(1);
					}
					i++;
				}
			}
		}
	}

	/**
	 * Returns the string value of the configured order field for the given document, or {@code null} when the document
	 * has no values for that field.
	 */
	private String orderFieldValue(final MapDocument doc) {
		final FieldList fields = doc.values(dedupConf.getOrderField());
		return fields == null || fields.isEmpty() ? null : fields.stringValue();
	}

	/**
	 * Tells whether the namespace prefix of the given identifier is listed in the configured skip list.
	 */
	private boolean mustSkip(final String idPivot) {
		return dedupConf.getSkipList().contains(getNsPrefix(idPivot));
	}

	/**
	 * Extracts the portion of the identifier enclosed between the first "|" and the following "::"; {@code null} when the
	 * identifier does not contain both delimiters.
	 */
	private String getNsPrefix(final String id) {
		return StringUtils.substringBetween(id, "|", "::");
	}

	/**
	 * Emits the similarity relation between the two identifiers in both directions.
	 */
	private void writeSimilarity(final Context context, final String idPivot, final String id) throws IOException, InterruptedException {
		final byte[] rowKey = Bytes.toBytes(idPivot);
		final byte[] target = Bytes.toBytes(id);

		emitRel(context, rowKey, target);
		emitRel(context, target, rowKey);
	}

	/**
	 * Writes a single relation: a {@link Put} on row {@code from}, in the similarity column family of the configured entity
	 * type, with {@code to} as qualifier and an empty value.
	 */
	private void emitRel(final Context context, final byte[] from, final byte[] to) throws IOException, InterruptedException {
		final Put put = new Put(from).add(DedupUtils.getSimilarityCFBytes(Type.valueOf(dedupConf.getEntityType())), to, Bytes.toBytes(""));
		put.setWriteToWAL(WRITE_TO_WAL);
		ibw.set(from);
		context.write(ibw, put);
	}
}
