package eu.dnetlib.data.mapreduce.hbase.dedup.experiment;

import java.io.StringWriter;
import java.util.List;
import java.util.Set;

import com.google.common.base.Joiner;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import org.apache.commons.lang.StringUtils;

/**
 * Serialises a list of {@link CsvEntry} objects as a CSV feature matrix.
 *
 * <p>The output has one header line followed by one line per retained entry:
 * <pre>
 * "k",&lt;feature 1&gt;,...,&lt;feature n&gt;,"id","name","title"
 * &lt;key&gt;,&lt;0|1&gt;,...,&lt;0|1&gt;,"&lt;id&gt;","&lt;original name&gt;","&lt;title&gt;"
 * </pre>
 * A cell is {@code 1} when the entry's feature collection contains the column's feature,
 * {@code 0} otherwise. Only the first {@code maxRows} entries are considered, and only the first
 * {@code maxFeatures} distinct features (in order of first appearance) become columns.
 *
 * <p>Created by claudio on 26/04/16.
 */
public class CsvSerialiser {

	/** Default cap on the number of feature columns. */
	private static final int MAX_FEATURES = 1000;

	/** Default cap on the number of entries read from the input list. */
	private static final int MAX_ROWS = 5000;

	private final int maxRows;
	private final int maxFeatures;

	/**
	 * Creates a serialiser using the default limits ({@value #MAX_ROWS} rows, {@value #MAX_FEATURES} features).
	 */
	public CsvSerialiser() {
		this(MAX_ROWS, MAX_FEATURES);
	}

	/**
	 * Creates a serialiser with custom limits.
	 *
	 * @param maxRows
	 *            maximum number of entries taken from the head of the input list
	 * @param maxFeatures
	 *            maximum number of distinct features used as columns
	 */
	public CsvSerialiser(int maxRows, int maxFeatures) {
		// NOTE(review): values are not validated here; Guava's Iterables.limit is documented to reject
		// negative sizes, so a negative limit would surface as an exception from asCSV - confirm if
		// fail-fast validation is wanted.
		this.maxRows = maxRows;
		this.maxFeatures = maxFeatures;
	}

	/**
	 * Renders the given entries as CSV text.
	 *
	 * <p>Rows whose feature cells are all {@code 0} or all {@code 1} are omitted: a row is written only
	 * when it contains at least one {@code 0} and at least one {@code 1}. As a consequence, when no
	 * feature column exists, only the header line is returned.
	 *
	 * <p>The {@code id}, {@code name} and {@code title} cells are enclosed in double quotes, with embedded
	 * double quotes doubled. A {@code null} value is rendered as the text {@code null}.
	 *
	 * @param list
	 *            the entries to serialise; only the first {@code maxRows} are read
	 * @return the CSV document, each line terminated by {@code \n}
	 */
	public String asCSV(final List<CsvEntry> list) {
		// First pass: collect the distinct features, preserving the order of first appearance.
		final Set<String> features = Sets.newLinkedHashSet();
		for (CsvEntry e : Iterables.limit(list, maxRows)) {
			features.addAll(e.getFeatures());
		}

		final List<String> cappedFeatures = Lists.newArrayList(Iterables.limit(features, maxFeatures));

		final StringBuilder csv = new StringBuilder();
		csv.append("\"k\",");
		// Emit the feature segment only when there is one, so that an empty feature set does not
		// produce a spurious empty column ("k",,"id",...).
		if (!cappedFeatures.isEmpty()) {
			// NOTE(review): feature names (and the row key below) are written verbatim, unquoted;
			// a name containing a comma or a quote would break the layout - confirm upstream guarantees.
			csv.append(Joiner.on(",").join(cappedFeatures)).append(',');
		}
		csv.append("\"id\",\"name\",\"title\"\n");

		// Second pass: one 0/1 row per entry.
		for (CsvEntry e : Iterables.limit(list, maxRows)) {

			boolean hasZero = false;
			boolean hasOne = false;

			final StringBuilder line = new StringBuilder();
			line.append(e.getKey()).append(',');
			for (String f : cappedFeatures) {
				if (e.getFeatures().contains(f)) {
					line.append("1,");
					hasOne = true;
				} else {
					line.append("0,");
					hasZero = true;
				}
			}
			line.append(quote(e.getId())).append(',');
			line.append(quote(e.getOriginalName())).append(',');
			line.append(quote(e.getTitle()));

			// Skip rows that are constant across all feature columns.
			if (hasZero && hasOne) {
				csv.append(line).append('\n');
			}
		}

		return csv.toString();
	}

	/**
	 * Wraps a value in double quotes, doubling any embedded double quote so the cell stays well-formed.
	 * A {@code null} value yields {@code "null"} (quoted), matching plain string concatenation.
	 */
	private static String quote(final Object value) {
		return "\"" + String.valueOf(value).replace("\"", "\"\"") + "\"";
	}

}
