package eu.dnetlib.functionality.index.solr.feed;

import java.io.StringReader;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import javax.xml.stream.XMLEventFactory;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLEventWriter;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.Namespace;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;

import org.apache.solr.common.SolrInputDocument;

import com.google.common.collect.Lists;

/**
 * Optimized version of the document parser, drop in replacement of InputDocumentFactory.
 * 
 * <p>
 * Faster because:
 * </p>
 * <ul>
 * <li>Doesn't create a DOM for the full document</li>
 * <li>Doesn't execute xpaths against the DOM</li>
 * <li>Quickly serialize the 'result' element directly in a string.</li>
 * <li>Uses less memory: less pressure on GC and allows more threads to process this in parallel</li>
 * </ul>
 * 
 * <p>
 * This class is fully reentrant and can be invoked in parallel.
 * </p>
 * 
 * @author marko
 * 
 */
public class StreamingInputDocumentFactory extends InputDocumentFactory {

	/** Local name of the element whose content is copied into the index result field. */
	protected static final String DNETRESULT = "dnetResult";

	/** Local name of the element whose children are turned into solr fields. */
	protected static final String TARGETFIELDS = "targetFields";

	/** Local name of the element holding the record identifier. */
	protected static final String INDEX_RECORD_ID_ELEMENT = "indexRecordIdentifier";

	/** Per-thread StAX input factory, configured so that DTDs and external entities are not processed. */
	protected ThreadLocal<XMLInputFactory> inputFactory = new ThreadLocal<XMLInputFactory>() {
		@Override
		protected XMLInputFactory initialValue() {
			final XMLInputFactory factory = XMLInputFactory.newInstance();
			// Records are parsed from externally supplied XML: do not resolve DTDs or external entities (XXE).
			factory.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
			factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
			return factory;
		}
	};

	/** Per-thread StAX output factory, used to serialize the result element. */
	protected ThreadLocal<XMLOutputFactory> outputFactory = new ThreadLocal<XMLOutputFactory>() {
		@Override
		protected XMLOutputFactory initialValue() {
			return XMLOutputFactory.newInstance();
		}
	};

	/** Per-thread StAX event factory, used to create the synthetic result start/end elements. */
	protected ThreadLocal<XMLEventFactory> eventFactory = new ThreadLocal<XMLEventFactory>() {
		@Override
		protected XMLEventFactory initialValue() {
			return XMLEventFactory.newInstance();
		}
	};

	/**
	 * Streams over the given index record and builds the corresponding solr document.
	 * 
	 * <p>
	 * The record identifier, the target fields and the result payload are extracted from the elements named
	 * {@link #INDEX_RECORD_ID_ELEMENT}, {@link #TARGETFIELDS} and {@link #DNETRESULT} respectively. The version and the
	 * datasource id are added as fields when they are not null.
	 * </p>
	 * 
	 * @param version
	 *            value for the version field, skipped when null
	 * @param inputDocument
	 *            the XML of the index record
	 * @param dsId
	 *            value for the datasource id field, skipped when null
	 * @return the populated document; an empty document when the record has no identifier or the XML cannot be parsed
	 */
	@Override
	public SolrInputDocument parseDocument(final String version, final String inputDocument, String dsId) {

		final StringWriter results = new StringWriter();
		XMLEventReader parser = null;

		try {
			parser = inputFactory.get().createXMLEventReader(new StringReader(inputDocument));

			final SolrInputDocument indexDocument = new SolrInputDocument();

			while (parser.hasNext()) {
				final XMLEvent event = parser.nextEvent();
				if (!event.isStartElement()) {
					continue;
				}

				final String localName = event.asStartElement().getName().getLocalPart();

				if (INDEX_RECORD_ID_ELEMENT.equals(localName)) {
					indexDocument.addField(INDEX_RECORD_ID, readText(parser));
				} else if (TARGETFIELDS.equals(localName)) {
					parseTargetFields(indexDocument, parser);
				} else if (DNETRESULT.equals(localName)) {
					copyResult(indexDocument, results, parser);
				}
			}

			if (version != null) {
				indexDocument.addField(DS_VERSION, version);
			}

			if (dsId != null) {
				indexDocument.addField(DS_ID, dsId);
			}

			if (!indexDocument.containsKey(INDEX_RECORD_ID)) {
				indexDocument.clear();
				System.err.println("missing indexrecord id:\n" + inputDocument);
			}

			return indexDocument;
		} catch (XMLStreamException e) {
			// An unparsable record yields an empty document, but the failure is reported instead of being dropped silently.
			System.err.println("unable to parse index record, returning an empty document: " + e);
			return new SolrInputDocument();
		} finally {
			closeQuietly(parser);
		}
	}

	/**
	 * Parse the targetFields block and add fields to the solr document.
	 * 
	 * <p>
	 * Expects the parser to be positioned right after the targetFields start tag and consumes events up to and
	 * including the matching end tag. Every start element found in the block becomes a field named after its local
	 * name, valued with the text that immediately follows the start tag. If the block contains no element at all, the
	 * whole document is cleared.
	 * </p>
	 * 
	 * @param indexDocument
	 *            the document receiving the fields
	 * @param parser
	 *            the event reader over the index record
	 * @throws XMLStreamException
	 *             if the underlying XML cannot be read
	 */
	protected void parseTargetFields(final SolrInputDocument indexDocument, final XMLEventReader parser) throws XMLStreamException {

		boolean hasFields = false;

		// Number of elements opened and not yet closed inside the block: an end tag seen at depth 0 is the one closing
		// the block itself, whatever the name of nested elements.
		int depth = 0;

		while (parser.hasNext()) {
			final XMLEvent targetEvent = parser.nextEvent();

			if (targetEvent.isEndElement()) {
				if (depth == 0) {
					break;
				}
				depth--;
			} else if (targetEvent.isStartElement()) {
				depth++;
				final String fieldName = targetEvent.asStartElement().getName().getLocalPart();
				addField(indexDocument, fieldName, readText(parser));
				hasFields = true;
			}
		}

		if (!hasFields) {
			indexDocument.clear();
		}
	}

	/**
	 * Copy the content of the dnetResult element into a new result element, declaring a fixed set of namespaces on it,
	 * and store the serialized XML in the index result field.
	 * 
	 * <p>
	 * Expects the parser to be positioned right after the dnetResult start tag and consumes events up to and including
	 * the matching end tag.
	 * </p>
	 * 
	 * @param indexDocument
	 *            the document receiving the serialized result
	 * @param results
	 *            buffer the result XML is written to
	 * @param parser
	 *            the event reader over the index record
	 * @throws XMLStreamException
	 *             if the XML cannot be read or written
	 */
	protected void copyResult(final SolrInputDocument indexDocument, final StringWriter results, final XMLEventReader parser) throws XMLStreamException {
		final XMLEventWriter writer = outputFactory.get().createXMLEventWriter(results);
		final XMLEventFactory events = eventFactory.get();

		// TODO: newRecord should copy all the namespace prefixes setup in parents
		// fortunately the only parent of the result element is the 'indexrecord', so it should be easy to get
		// the namespaces declared on the root element (and fast)

		final List<Namespace> namespaces = Lists.newArrayList(
				events.createNamespace("dri", "http://www.driver-repository.eu/namespace/dri"),
				events.createNamespace("dr", "http://www.driver-repository.eu/namespace/dr"),
				events.createNamespace("xsi", "http://www.w3.org/2001/XMLSchema-instance"),
				events.createNamespace("dc", "http://purl.org/dc/elements/1.1/"),
				events.createNamespace("oaf", "http://namespace.openaire.eu/oaf"));

		final StartElement newRecord = events.createStartElement("", null, RESULT, null, namespaces.iterator());

		// new root record
		writer.add(newRecord);

		// Copy the rest as it is. The end of the block is found by depth rather than by tag name, so that a nested
		// element sharing the same local name does not terminate the copy early.
		int depth = 0;
		while (parser.hasNext()) {
			final XMLEvent resultEvent = parser.nextEvent();

			if (resultEvent.isStartElement()) {
				depth++;
			} else if (resultEvent.isEndElement()) {
				if (depth == 0) {
					writer.add(events.createEndElement("", null, RESULT));
					break;
				}
				depth--;
			}

			writer.add(resultEvent);
		}
		writer.close();

		// Store the serialized XML, not the writer object.
		indexDocument.addField(INDEX_RESULT, results.toString());
	}

	/**
	 * Helper used to add a field to a solr doc. It avoids adding empty fields: the value is trimmed and skipped when
	 * nothing is left. The field name is lower-cased independently of the default locale.
	 * 
	 * @param indexDocument
	 *            the document receiving the field
	 * @param field
	 *            the field name
	 * @param value
	 *            the raw field value, ignored when null
	 */
	private final void addField(SolrInputDocument indexDocument, String field, String value) {
		if (value == null) {
			return;
		}
		final String cleaned = value.trim();
		if (!cleaned.isEmpty()) {
			indexDocument.addField(field.toLowerCase(Locale.ROOT), cleaned);
		}
	}

	/**
	 * Reads the text that immediately follows the current position of the parser. Consecutive character events are
	 * concatenated, since a parser may deliver the text of one element in several chunks. The first event that is not
	 * character data is left unread.
	 * 
	 * @param parser
	 *            the event reader, positioned right after a start tag
	 * @return the concatenated text, or an empty string when the next event is not character data
	 * @throws XMLStreamException
	 *             if the underlying XML cannot be read
	 */
	private String readText(final XMLEventReader parser) throws XMLStreamException {
		final StringBuilder text = new StringBuilder();
		while (parser.hasNext() && parser.peek().isCharacters()) {
			text.append(getText(parser.nextEvent()));
		}
		return text.toString();
	}

	/**
	 * Closes the given reader, ignoring failures: at that point the document has already been built or discarded.
	 * 
	 * @param reader
	 *            the reader to close, may be null
	 */
	private static void closeQuietly(final XMLEventReader reader) {
		if (reader == null) {
			return;
		}
		try {
			reader.close();
		} catch (XMLStreamException ignored) {
			// nothing useful can be done if releasing the reader fails
		}
	}

	/**
	 * Helper used to get the string from a text element.
	 * 
	 * @param text
	 *            the event to extract the text from
	 * @return the character data of the event, or an empty string when the event is null or not character data (e.g. an
	 *         end tag or a nested start tag)
	 */
	protected final String getText(XMLEvent text) {
		if (text == null || !text.isCharacters()) {
			return "";
		}

		return text.asCharacters().getData();
	}

}
