package eu.dnetlib.iis.collapsers;

import eu.dnetlib.iis.core.common.AvroUtils;
import eu.dnetlib.iis.core.java.HadoopContext;
import eu.dnetlib.iis.core.java.PortBindings;
import eu.dnetlib.iis.core.java.Process;
import eu.dnetlib.iis.core.java.io.DataStore;
import eu.dnetlib.iis.core.java.io.FileSystemPath;
import eu.dnetlib.iis.core.java.porttype.AvroPortType;
import eu.dnetlib.iis.core.java.porttype.PortType;
import java.util.*;
import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

/**
 * Workflow process that merges several Avro data stores sharing one schema
 * into a single output data store, tagging every record with the origin
 * label of the input it was read from.
 *
 * <p>Each output record has exactly two fields: {@code origin}, holding the
 * label of the source input, and one data field holding the input record
 * unchanged.
 *
 * <p>Required entries of the {@code parameters} map passed to {@code run()}:
 * <ul>
 *   <li>{@code origins} - comma-separated origin labels, one per input;</li>
 *   <li>{@code input_ports} - comma-separated paths of the input data stores,
 *       in the same order as {@code origins};</li>
 *   <li>{@code input_schema} - descriptor of the input record schema, as
 *       accepted by {@code AvroUtils.toSchema};</li>
 *   <li>{@code output_schema} - descriptor of the output record schema, as
 *       accepted by {@code AvroUtils.toSchema}.</li>
 * </ul>
 *
 * <p>Known limitations (Mafju review): this class does not obey our
 * conventions for specifying types and number of input ports - the
 * {@code getInputPorts()} method returns an empty map. This could be amended
 * by accepting appropriate data in the constructor of the {@code Process} and
 * creating the description of the inputs there. The schema returned by
 * {@code getOutputPorts()} is only set in {@code run()}, which is bad, since
 * {@code getOutputPorts()} should work even when {@code run()} has not been
 * executed before.
 *
 * @author Dominika Tkaczyk
 */
public class Union implements Process {

    /** Name of the only output port of this process. */
    private static final String OUTPUT_PORT = "output";

    /** Name of the output record field that stores the origin label. */
    private static final String ORIGIN_FIELD = "origin";

    /** Schema of the input records; assigned in {@code run()}. */
    private Schema inputSchema;

    /** Schema of the output records; assigned in {@code run()}. */
    private Schema outputSchema;

    /**
     * Returns the declared input ports.
     *
     * @return always an empty, mutable map; the actual inputs are passed to
     *         {@code run()} through the {@code input_ports} parameter
     */
    @Override
    public Map<String, PortType> getInputPorts() {
        return new HashMap<String, PortType>();
    }

    /**
     * Returns the single {@code output} port.
     *
     * <p>The port is built from the output schema field, which is still
     * {@code null} until {@code run()} has parsed the {@code output_schema}
     * parameter.
     *
     * @return a new map holding the {@code output} port
     */
    @Override
    public Map<String, PortType> getOutputPorts() {
        Map<String, PortType> outputPorts = new HashMap<String, PortType>();
        outputPorts.put(OUTPUT_PORT, new AvroPortType(outputSchema));
        return outputPorts;
    }

    /**
     * Reads every input data store and writes all of its records, wrapped
     * together with the matching origin label, to the output data store.
     *
     * @param portBindings bindings used to resolve the {@code output} port
     * @param context Hadoop context supplying the file system configuration
     * @param parameters process parameters; see the class description
     * @throws RuntimeException if a required parameter is missing, if the
     *         number of origins differs from the number of input ports, if
     *         the output schema does not consist of exactly two fields one of
     *         which is named {@code origin}, or if the schema of the data
     *         field differs from the input schema
     * @throws Exception if reading the inputs or writing the output fails
     */
    @Override
    public void run(PortBindings portBindings, HadoopContext context,
            Map<String, String> parameters) throws Exception {
        String originsValue = requireParameter(parameters, "origins");
        String inputPortsValue = requireParameter(parameters, "input_ports");
        String inputSchemaValue = requireParameter(parameters, "input_schema");
        String outputSchemaValue = requireParameter(parameters, "output_schema");

        List<String> origins = Arrays.asList(originsValue.split(","));
        List<String> inputPorts = Arrays.asList(inputPortsValue.split(","));
        // Origins and input ports are matched by position, so a length
        // mismatch would either fail mid-way with an index error or silently
        // drop the surplus origins. Reject it before anything is written.
        if (origins.size() != inputPorts.size()) {
            throw new RuntimeException("Number of origins (" + origins.size()
                    + ") does not match the number of input ports ("
                    + inputPorts.size() + ")!");
        }

        inputSchema = AvroUtils.toSchema(inputSchemaValue);
        outputSchema = AvroUtils.toSchema(outputSchemaValue);
        Field dataField = findDataField(outputSchema);
        if (!inputSchema.equals(dataField.schema())) {
            throw new RuntimeException("Schemas: " + inputSchema + " and " + dataField.schema() + " are not the same!");
        }

        FileSystem fs = FileSystem.get(context.getConfiguration());

        DataFileWriter<GenericRecord> output =
                DataStore.create(
                    new FileSystemPath(fs, portBindings.getOutput().get(OUTPUT_PORT)),
                    outputSchema);
        try {
            for (int i = 0; i < inputPorts.size(); i++) {
                copyRecords(fs, inputPorts.get(i), origins.get(i), dataField, output);
            }
        } finally {
            // Close the writer even when copying fails, so the underlying
            // stream is not leaked.
            output.close();
        }
    }

    /**
     * Returns the value of a mandatory parameter.
     *
     * @param parameters process parameters
     * @param name name of the required parameter
     * @return the non-null parameter value
     * @throws RuntimeException if the parameter is absent
     */
    private static String requireParameter(Map<String, String> parameters, String name) {
        String value = parameters.get(name);
        if (value == null) {
            throw new RuntimeException("No " + name + " parameter passed!");
        }
        return value;
    }

    /**
     * Checks the shape of the output schema and returns the field that is
     * going to hold the input record.
     *
     * @param schema output record schema
     * @return the field which is not the {@code origin} field
     * @throws RuntimeException if the schema does not have exactly two fields
     *         or none of them is named {@code origin}
     */
    private static Field findDataField(Schema schema) {
        List<Field> fields = schema.getFields();
        if (fields.size() == 2) {
            if (ORIGIN_FIELD.equals(fields.get(0).name())) {
                return fields.get(1);
            }
            if (ORIGIN_FIELD.equals(fields.get(1).name())) {
                return fields.get(0);
            }
        }
        throw new RuntimeException("Output schema: " + schema + " should contain two fields, one of them named \"origin\"!");
    }

    /**
     * Appends every record of one input data store to the output, wrapped in
     * an output record carrying the given origin label.
     *
     * @param fs file system on which the input data store is located
     * @param inputPort path of the input data store
     * @param origin origin label stored with every record of this input
     * @param dataField output field that receives the input record
     * @param output writer of the output data store
     * @throws Exception if reading the input or appending to the output fails
     */
    private void copyRecords(FileSystem fs, String inputPort, String origin,
            Field dataField, DataFileWriter<GenericRecord> output) throws Exception {
        Iterator<GenericRecord> records = DataStore.getReader(
                new FileSystemPath(fs, new Path(inputPort)), inputSchema);
        try {
            while (records.hasNext()) {
                GenericRecord outputRecord = new GenericData.Record(outputSchema);
                outputRecord.put(ORIGIN_FIELD, origin);
                outputRecord.put(dataField.name(), records.next());
                output.append(outputRecord);
            }
        } finally {
            // NOTE(review): the reader is presumably closeable (it is backed
            // by files); release it if so - confirm against DataStore.getReader.
            if (records instanceof java.io.Closeable) {
                ((java.io.Closeable) records).close();
            }
        }
    }

}
