package eu.dnetlib.data.mapreduce.hbase.broker;

import java.io.IOException;
import java.util.Collections;
import java.util.Map;

import com.google.common.collect.Maps;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.math.NumberUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;

/**
 * Reducer that, for every key, tallies how often each distinct value occurs and publishes
 * the outcome exclusively through Hadoop counters; nothing is written to the job output.
 *
 * <p>Judging by the class and variable names, a key presumably identifies a person and each
 * value names the repository a publication was collected from (to be confirmed against the
 * matching mapper).
 *
 * <p>A key contributes to the {@code "Max percentage of results in a repo"} counter group only
 * when all three thresholds hold; otherwise one counter of the {@code "Skipped person"} group
 * is incremented, describing the first threshold that failed.
 */
public class CalculatePersonDistributionStep2Reducer extends TableReducer<Text, Text, NullWritable> {

	private static final Log log = LogFactory.getLog(CalculatePersonDistributionStep2Reducer.class);

	/** Threshold values applied when the job configuration does not provide usable ones. */
	private static final int DEFAULT_MIN_PUBLICATIONS = 10;
	private static final int DEFAULT_MIN_PUBLICATIONS_IN_REPO = 4;
	private static final int DEFAULT_MAX_REPOS = 10;

	/** Counter group names. */
	private static final String PERCENTAGE_GROUP = "Max percentage of results in a repo";
	private static final String SKIPPED_GROUP = "Skipped person";

	/** Minimum number of values a key must have to be considered. */
	private int minPublications = DEFAULT_MIN_PUBLICATIONS;

	/** Minimum number of occurrences required for the most frequent value of a key. */
	private int minPublicationsInRepo = DEFAULT_MIN_PUBLICATIONS_IN_REPO;

	/** Maximum number of distinct values a key may have to be considered. */
	private int maxRepos = DEFAULT_MAX_REPOS;

	/**
	 * Loads the three thresholds from the job configuration properties {@code MIN_PUBLICATIONS},
	 * {@code MIN_PUBLICATIONS_IN_REPO} and {@code MAX_REPOS}, then logs the effective values.
	 *
	 * @param context the reducer context giving access to the job configuration
	 */
	@Override
	protected void setup(final Context context) throws IOException, InterruptedException {
		super.setup(context);

		this.minPublications = readInt(context, "MIN_PUBLICATIONS", DEFAULT_MIN_PUBLICATIONS);
		this.minPublicationsInRepo = readInt(context, "MIN_PUBLICATIONS_IN_REPO", DEFAULT_MIN_PUBLICATIONS_IN_REPO);
		this.maxRepos = readInt(context, "MAX_REPOS", DEFAULT_MAX_REPOS);

		final String message = String.format("Starting with param minPublications=%s, minPublicationsInRepo=%s, maxRepos=%s",
				minPublications, minPublicationsInRepo, maxRepos);
		log.info(message);
	}

	/**
	 * Evaluates one key: counts its values per distinct string, checks the thresholds and
	 * increments exactly one counter.
	 *
	 * @param key     the grouping key (not inspected here)
	 * @param values  the values grouped under the key
	 * @param context the reducer context used to reach the counters
	 */
	@Override
	protected void reduce(final Text key, final Iterable<Text> values, final Context context)
			throws IOException, InterruptedException {

		final Map<String, Integer> occurrences = Maps.newHashMap();
		final int nValues = tally(values, occurrences);

		// The "too few values" check wins over the "too many distinct values" check.
		if (nValues < minPublications) {
			countSkipped(context, "total pubs < " + minPublications);
			return;
		}

		if (occurrences.size() > maxRepos) {
			countSkipped(context, "n. repos > " + maxRepos);
			return;
		}

		final int highest = Collections.max(occurrences.values());
		if (highest < minPublicationsInRepo) {
			countSkipped(context, "n pubs in main repo < " + minPublicationsInRepo);
			return;
		}

		// Integer arithmetic: the percentage is truncated, not rounded.
		final int percentage = 100 * highest / nValues;
		// Zero-padded to three characters ("007 %"), presumably so counter names sort numerically.
		final String padded = StringUtils.leftPad(String.valueOf(percentage), 3, "0");
		context.getCounter(PERCENTAGE_GROUP, padded + " %").increment(1);
	}

	/** Reads an int property from the job configuration, using {@code fallback} when it cannot be converted. */
	private static int readInt(final Context context, final String property, final int fallback) {
		final String raw = context.getConfiguration().get(property);
		return NumberUtils.toInt(raw, fallback);
	}

	/** Adds the number of occurrences of each distinct value to {@code occurrences} and returns how many values were seen. */
	private static int tally(final Iterable<Text> values, final Map<String, Integer> occurrences) {
		int seen = 0;
		for (final Text value : values) {
			final String collectedFrom = Bytes.toString(value.copyBytes());
			final Integer previous = occurrences.get(collectedFrom);
			if (previous == null) {
				occurrences.put(collectedFrom, 1);
			} else {
				occurrences.put(collectedFrom, previous + 1);
			}
			seen++;
		}
		return seen;
	}

	/** Increments the counter of the "Skipped person" group that carries the given reason. */
	private static void countSkipped(final Context context, final String reason) {
		context.getCounter(SKIPPED_GROUP, reason).increment(1);
	}
}
