package eu.dnetlib.data.mapreduce.hbase.dedup.experiment;

import java.io.IOException;

import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;

import com.google.common.base.Function;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;

import com.google.common.collect.Sets;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class JoinPersonGroupReducer extends Reducer<Text, Text, Text, Text> {

	/**
	 * logger.
	 */
	private static final Log log = LogFactory.getLog(JoinPersonGroupReducer.class); // NOPMD by marko on 11/24/08 5:02 PM

	private Text tKey;
	private Text tValue;

	private final static int MIN_ENTRIES_THRESHOLD = 1;
	private int minEntriesThreshold;

	private final static int MAX_ENTRIES_THRESHOLD = Integer.MAX_VALUE;
	private int maxEntriesThreshold;

	private final static int MAX_FEATURES_THRESHOLD = Integer.MAX_VALUE;
	private int maxFeaturesThreshold;

	private Set<String> knownHashValues = Sets.newHashSet();

	private boolean passAll = false;

	@Override
	protected void setup(final Context context) throws IOException, InterruptedException {
		super.setup(context);
		tKey = new Text("");
		tValue = new Text();

		minEntriesThreshold = context.getConfiguration().getInt("min.entries.threshold", MIN_ENTRIES_THRESHOLD);
		maxEntriesThreshold = context.getConfiguration().getInt("max.entries.threshold", MAX_ENTRIES_THRESHOLD);
		maxFeaturesThreshold = context.getConfiguration().getInt("max.features.threshold", MAX_FEATURES_THRESHOLD);

		final String hashCsv = context.getConfiguration().get("hash.values.csv", "");

		log.info("hash csv: " + hashCsv);
		if (hashCsv.contains("ALL")) {
			passAll = true;
		}

		for(String hash : Splitter.on(",").omitEmptyStrings().trimResults().split(hashCsv)) {
			knownHashValues.add(hash);
		}

	}

	@Override
	protected void reduce(final Text key, final Iterable<Text> values, final Context context) throws IOException, InterruptedException {

		final CsvSerialiser csvSerialiser = new CsvSerialiser(maxEntriesThreshold, maxFeaturesThreshold);
		final String outKey = key.toString().replaceAll("[^a-zA-Z ]", "").toLowerCase();

		if (!passAll && !knownHashValues.contains(outKey)) {
			return;
		}

		if (StringUtils.isBlank(outKey)) {
			context.getCounter("person", "blank key").increment(1);
			return;
		}

		final List<CsvEntry> entries = Lists.newArrayList(Iterables.transform(values, new Function<Text, CsvEntry>() {

			@Override
			public CsvEntry apply(final Text t) {
				return CsvEntry.fromJson(t.toString());
			}
		}));

		trackPersonInfo(entries.size(), context, "person");

		if (entries.size() < minEntriesThreshold || entries.size() > maxEntriesThreshold) {
			return;
		}

		if (!passAll) {
			context.getCounter("person hash", outKey).increment(entries.size());
		}

		//tKey.set(outKey);
		tValue.set(csvSerialiser.asCSV(entries));
		context.write(tKey, tValue);

		context.getCounter("person", "csv").increment(1);
	}

	private void trackPersonInfo(final int count, final Context context, final String counterName) {

		if (count > 0 && count <= 10) {
			context.getCounter(counterName, count + "").increment(1);
			return;
		}

		if (count > 10 && count <= 20) {
			context.getCounter(counterName, "[10, 20)").increment(1);
			return;
		}

		if (count > 20 && count <= 30) {
			context.getCounter(counterName, "[20, 30)").increment(1);
			return;
		}

		if (count > 30 && count <= 40) {
			context.getCounter(counterName, "[30, 40)").increment(1);
			return;
		}

		if (count > 40 && count <= 50) {
			context.getCounter(counterName, "[40, 50)").increment(1);
			return;
		}

		if (count > 50 && count <= 70) {
			context.getCounter(counterName, "[50, 70)").increment(1);
			return;
		}

		if (count > 70 && count <= 100) {
			context.getCounter(counterName, "[70, 100)").increment(1);
			return;
		}

		if (count > 100 && count <= 150) {
			context.getCounter(counterName, "[100, 150)").increment(1);
			return;
		}

		if (count > 150 && count <= 200) {
			context.getCounter(counterName, "[150, 200)").increment(1);
			return;
		}

		if (count > 200) {
			context.getCounter(counterName, "[200, *)").increment(1);
			return;
		}
	}

	@Override
	public void cleanup(final Context context) throws IOException, InterruptedException {
		super.cleanup(context);
	}

}
