package eu.dnetlib.data.mapreduce.hbase.dedup;

import java.io.IOException;
import java.util.Collection;
import java.util.List;
import java.util.Map;

import com.google.common.collect.Maps;
import eu.dnetlib.data.mapreduce.JobParams;
import eu.dnetlib.data.mapreduce.util.DedupUtils;
import eu.dnetlib.data.mapreduce.util.OafDecoder;
import eu.dnetlib.data.proto.OafProtos.OafEntity;
import eu.dnetlib.data.proto.TypeProtos.Type;
import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.config.WfConfig;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.model.ProtoDocumentBuilder;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobTracker.IllegalStateException;

/**
 * HBase table mapper used by the dedup workflow.
 *
 * <p>
 * For every scanned row it reads the entity body stored under the column family named after the configured entity type,
 * decodes it, converts it to a {@link MapDocument} and writes the serialized document once for each key returned by
 * {@link BlacklistAwareClusteringCombiner#filterAndCombine}. Rows without a body, entities flagged as deleted by inference,
 * entities of a different type and (when a sub-type is configured) entities of a different sub-type are not emitted;
 * each of these outcomes is tracked by a job counter.
 * </p>
 */
public class DedupMapper extends TableMapper<Text, ImmutableBytesWritable> {

	private static final Log log = LogFactory.getLog(DedupMapper.class);

	/** Dedup configuration, loaded in {@link #setup} from the {@link JobParams#DEDUP_CONF} job property. */
	private DedupConfig dedupConf;

	/** Blacklists taken from the pace section of the configuration; passed as-is to the clustering combiner. */
	private Map<String, List<String>> blackListMap = Maps.newHashMap();

	/** Reusable output key. */
	private Text outKey;

	/** Reusable output value wrapping the serialized document. */
	private ImmutableBytesWritable ibw;

	/**
	 * Loads the dedup configuration from the job configuration and allocates the reusable output writables.
	 *
	 * @param context
	 *            the mapper context giving access to the job configuration
	 */
	@Override
	protected void setup(final Context context) throws IOException, InterruptedException {

		final String dedupConfJson = context.getConfiguration().get(JobParams.DEDUP_CONF);

		log.info("pace conf strings");
		log.info("pace conf: " + dedupConfJson);

		dedupConf = DedupConfig.load(dedupConfJson);
		blackListMap = dedupConf.getPace().getBlacklists();

		outKey = new Text();
		ibw = new ImmutableBytesWritable();

		log.info("wf conf: " + dedupConf.toString());
	}

	/**
	 * Processes one table row: decodes the entity body, applies the type / sub-type filters and emits the document
	 * under each of its clustering keys.
	 *
	 * @param keyIn
	 *            the row key, used as the identifier of the generated document
	 * @param result
	 *            the row content
	 * @param context
	 *            the mapper context used for counters and output
	 * @throws IOException
	 *             when writing the output fails, or when a sub-type is configured but the document has no such field
	 */
	@Override
	protected void map(final ImmutableBytesWritable keyIn, final Result result, final Context context) throws IOException, InterruptedException {

		final WfConfig wf = dedupConf.getWf();
		final String entityType = wf.getEntityType();

		// Bytes.toBytes encodes as UTF-8 regardless of the JVM default charset, unlike String.getBytes().
		final byte[] body = result.getValue(Bytes.toBytes(entityType), DedupUtils.BODY_B);
		if (body == null) {
			context.getCounter(entityType, "missing body").increment(1);
			return;
		}

		final OafDecoder decoder = OafDecoder.decode(body);
		if (decoder.getOaf().getDataInfo().getDeletedbyinference()) {
			context.getCounter(entityType, "deleted by inference").increment(1);
			return;
		}

		final OafEntity entity = decoder.getEntity();
		final String decodedType = entity.getType().toString();
		context.getCounter(decodedType, "decoded").increment(1);

		if (!entity.getType().equals(Type.valueOf(entityType))) {
			// the body holds an entity of a type this job is not configured for
			return;
		}

		final MapDocument doc = ProtoDocumentBuilder.newInstance(Bytes.toString(keyIn.copyBytes()), entity, dedupConf.getPace().getModel());
		context.getCounter(decodedType, "converted as MapDocument").increment(1);

		if (wf.hasSubType() && !matchesSubType(context, wf, doc)) {
			return;
		}

		emitNGrams(context, doc, BlacklistAwareClusteringCombiner.filterAndCombine(doc, dedupConf, blackListMap));
	}

	/**
	 * Checks the document against the configured sub-type and updates the per sub-type counters accordingly.
	 *
	 * @param context
	 *            the mapper context used for counters
	 * @param wf
	 *            the workflow configuration providing the sub-type field name and the expected value
	 * @param doc
	 *            the document to check
	 * @return true when the document's sub-type field equals (ignoring case) the configured value
	 * @throws IOException
	 *             declared so that the IllegalStateException imported by this file (the Hadoop JobTracker one, not
	 *             java.lang's) can be thrown when the document has no field with the configured name
	 */
	private boolean matchesSubType(final Context context, final WfConfig wf, final MapDocument doc) throws IOException {

		final String subTypeField = wf.getSubEntityType();
		final Map<String, Field> fields = doc.getFieldMap();

		if (!fields.containsKey(subTypeField)) {
			throw new IllegalStateException(String.format("model map does not contain field %s", subTypeField));
		}

		final String subType = fields.get(subTypeField).stringValue();
		final boolean accepted = wf.getSubEntityValue().equalsIgnoreCase(subType);

		context.getCounter(subType, accepted ? "converted as MapDocument" : "ignored").increment(1);
		return accepted;
	}

	/**
	 * Writes the serialized document once per key.
	 *
	 * @param context
	 *            the mapper context used for output
	 * @param doc
	 *            the document to emit
	 * @param ngrams
	 *            the output keys
	 */
	private void emitNGrams(final Context context, final MapDocument doc, final Collection<String> ngrams) throws IOException, InterruptedException {
		if (ngrams.isEmpty()) {
			// nothing to emit: skip the serialization altogether
			return;
		}

		// the value is the same for every key: serialize the document once instead of once per key
		ibw.set(doc.toByteArray());

		for (final String ngram : ngrams) {
			outKey.set(ngram);
			context.write(outKey, ibw);
		}
	}

}
