package eu.dnetlib.data.mapreduce.hbase.dataimport;

import java.io.IOException;
import java.io.StringReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerFactory;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.exception.ExceptionUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.io.DocumentResult;
import org.dom4j.io.DocumentSource;
import org.dom4j.io.SAXReader;

public class FilterXmlRecordsMapper extends Mapper<Text, Text, Text, Text> {

	private static final Log log = LogFactory.getLog(ImportRecordsMapper.class); // NOPMD by marko on 11/24/08 5:02 PM

	private Transformer transformer;

	private SAXReader saxReader;

	private final static String xslt =
			"<xsl:stylesheet version=\"1.0\" xmlns:xsl=\"http://www.w3.org/1999/XSL/Transform\">\n"
					+ "  <xsl:template match=\"@*|node()\">\n"
					+ "    <xsl:copy>\n"
					+ "      <xsl:apply-templates select=\"@*|node()\"/>\n"
					+ "    </xsl:copy>\n"
					+ "  </xsl:template>\n"
					+ "</xsl:stylesheet>";

	@Override
	protected void setup(final Context context) throws IOException, InterruptedException {
		super.setup(context);

		saxReader = new SAXReader();

		log.info("using xslt:\n" + xslt);
		try {
			transformer = TransformerFactory.newInstance().newTransformer(new DocumentSource((new SAXReader()).read(new StringReader(xslt))));
		} catch (TransformerConfigurationException | DocumentException e) {
			log.error(e);
			throw new RuntimeException(e);
		}

		log.info("using trasformer: '" + transformer.getClass().getName() + "'");
	}

	@Override
	protected void map(final Text key, final Text value, final Context context) throws IOException, InterruptedException {
		try {
			final DocumentResult result = new DocumentResult();
			final Document document = saxReader.read(new StringReader(value.toString()));
			transformer.transform(new DocumentSource(document), result);

			context.write(key, new Text(result.getDocument().asXML()));

		} catch (final Throwable e) {
			//log.error("error parsing record\n" + value.toString(), e);

			context.getCounter("error", e.getClass().getName()).increment(1);

			final String c = getInvalidXmlChar(e);
			if (StringUtils.isNotBlank(c)) {
				context.getCounter("invalid char", c).increment(1);
			}
		}
	}

	public static String getInvalidXmlChar(final Throwable e) {
		final String error = ExceptionUtils.getRootCauseMessage(e);
		if (StringUtils.contains(error, "An invalid XML character")) {
			final Pattern p = Pattern.compile(".*\\(.*:\\s?(?<char>.*)\\).*");
			final Matcher m = p.matcher(error);
			if (m.matches()) {
				final String c = m.group("char");
				if (StringUtils.isNotBlank(c)) {
					return c;
				}
			}
		}
		return null;
	}

}
