package eu.dnetlib.data.mapreduce.hbase.openorgs;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import com.google.protobuf.InvalidProtocolBufferException;
import com.googlecode.protobuf.format.JsonFormat;

import eu.dnetlib.data.proto.OafProtos.Oaf;
import eu.dnetlib.data.proto.OafProtos.OafEntity;

/**
 * Reducer that, for each group of serialized {@link Oaf} records sharing a key, chooses at most one
 * {@link OafEntity} and writes it as JSON text.
 *
 * <p>Selection order, first match wins:
 * <ol>
 * <li>an entity whose id starts with {@code "20|" + OpenOrgsCommon.OPENORGS_MAIN_PREFIX}: the group is only
 * counted, nothing is written;</li>
 * <li>an entity whose id starts with {@code "20|" + OpenOrgsCommon.OPENORGS_CORDA_FP7_PREFIX};</li>
 * <li>an entity whose id starts with {@code "20|" + OpenOrgsCommon.OPENORGS_CORDA_H2020_PREFIX};</li>
 * <li>any entity whose id starts with {@code "20|"}.</li>
 * </ol>
 * When several entities match the same prefix, the one with the smallest id is chosen. If no entity matches
 * any rule, nothing is written and no counter is updated.
 */
public class GenerateOrganizationsReducer extends Reducer<ImmutableBytesWritable, ImmutableBytesWritable, NullWritable, Text> {

	/** Counter group used by every counter this reducer updates. */
	private static final String COUNTER_GROUP = "organization";

	/**
	 * Leading part of every id this reducer considers.
	 * NOTE(review): presumably the entity-type marker for organizations; confirm against the id scheme.
	 */
	private static final String ENTITY_ID_PREFIX = "20|";

	/** Reused output buffer, overwritten on every emitted record. */
	private final Text valueOut = new Text();

	/**
	 * Parses every value of the group and applies the selection order described on the class.
	 *
	 * @param key
	 *            the group key; used only to give context to a parse failure
	 * @param values
	 *            serialized {@link Oaf} messages belonging to the group
	 * @param context
	 *            the reducer context used for counters and output
	 * @throws IOException
	 *             if writing the selected entity fails
	 * @throws InterruptedException
	 *             if writing the selected entity is interrupted
	 * @throws RuntimeException
	 *             if one of the values is not a valid {@link Oaf} message
	 */
	@Override
	protected void reduce(final ImmutableBytesWritable key, final Iterable<ImmutableBytesWritable> values, final Context context)
			throws IOException, InterruptedException {

		final List<OafEntity> list = new ArrayList<>();
		try {
			for (final ImmutableBytesWritable ibw : values) {
				list.add(Oaf.parseFrom(ibw.get()).getEntity());
			}
		} catch (final InvalidProtocolBufferException e) {
			// Kept as an unchecked failure, as before; the cause carries the full stack trace.
			throw new RuntimeException("Unable to parse Oaf message for key " + key, e);
		}

		// A match on the main prefix suppresses any output for the whole group.
		if (findOrgToEmit(OpenOrgsCommon.OPENORGS_MAIN_PREFIX, list) != null) {
			context.getCounter(COUNTER_GROUP, "already present in openOrgs").increment(1);
			return;
		}
		if (emitIfFound(OpenOrgsCommon.OPENORGS_CORDA_FP7_PREFIX, "new (from corda FP7)", list, context)) { return; }
		if (emitIfFound(OpenOrgsCommon.OPENORGS_CORDA_H2020_PREFIX, "new (from corda H2020)", list, context)) { return; }

		// The empty prefix accepts every id that starts with ENTITY_ID_PREFIX.
		emitIfFound("", "new (from other sources)", list, context);
	}

	/**
	 * Looks up a candidate for the given prefix and, when one exists, increments the named counter and emits it.
	 *
	 * @param idPrefix
	 *            prefix expected right after {@code "20|"} in the entity id
	 * @param counterName
	 *            name of the counter (in the {@code "organization"} group) to increment on a match
	 * @param list
	 *            the parsed entities of the current group
	 * @param context
	 *            the reducer context
	 * @return {@code true} if an entity was found and emitted, {@code false} otherwise
	 * @throws IOException
	 *             if writing the entity fails
	 * @throws InterruptedException
	 *             if writing the entity is interrupted
	 */
	private boolean emitIfFound(final String idPrefix, final String counterName, final List<OafEntity> list, final Context context)
			throws IOException, InterruptedException {
		final OafEntity entity = findOrgToEmit(idPrefix, list);
		if (entity == null) { return false; }

		context.getCounter(COUNTER_GROUP, counterName).increment(1);
		emit(entity, context);
		return true;
	}

	/**
	 * Returns the entity with the smallest id among those whose id starts with {@code "20|" + idPrefix}.
	 *
	 * @param idPrefix
	 *            prefix expected right after {@code "20|"}; the empty string accepts every {@code "20|"} id
	 * @param list
	 *            the entities to scan
	 * @return the matching entity with the smallest id (the first one encountered on ties), or {@code null} if
	 *         none matches
	 */
	private OafEntity findOrgToEmit(final String idPrefix, final List<OafEntity> list) {
		final String fullPrefix = ENTITY_ID_PREFIX + idPrefix;

		OafEntity best = null;
		for (final OafEntity candidate : list) {
			if (!candidate.getId().startsWith(fullPrefix)) {
				continue;
			}
			// Strict comparison keeps the first-seen entity when two ids are equal.
			if (best == null || StringUtils.compare(candidate.getId(), best.getId()) < 0) {
				best = candidate;
			}
		}
		return best;
	}

	/**
	 * Increments the {@code "new (total)"} counter and writes the entity, rendered as JSON, with a null key.
	 *
	 * @param entity
	 *            the entity to serialize
	 * @param context
	 *            the reducer context
	 * @throws IOException
	 *             if the write fails
	 * @throws InterruptedException
	 *             if the write is interrupted
	 */
	private void emit(final OafEntity entity, final Context context) throws IOException, InterruptedException {
		context.getCounter(COUNTER_GROUP, "new (total)").increment(1);
		valueOut.set(JsonFormat.printToString(entity));
		context.write(NullWritable.get(), valueOut);
	}

}
