/*
 * Decompiled with CFR 0.152.
 */
package eu.dnetlib.data.mapreduce.hbase.dataimport;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.transform.Result;
import javax.xml.transform.Source;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerFactory;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.exception.ExceptionUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.io.DocumentResult;
import org.dom4j.io.DocumentSource;
import org.dom4j.io.SAXReader;

public class GetInvalidXmlRecordsMapper
extends Mapper<Text, Text, Text, Text> {
    private static final Log log = LogFactory.getLog(GetInvalidXmlRecordsMapper.class);
    public static final String DOI_REGEX = "(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\\\"&\\'])\\S)+)";
    private Transformer transformer;
    private SAXReader saxReader;
    private Text valueOut;
    private static final String xslt = "<xsl:stylesheet version=\"1.0\" xmlns:xsl=\"http://www.w3.org/1999/XSL/Transform\">\n  <xsl:template match=\"@*|node()\">\n    <xsl:copy>\n      <xsl:apply-templates select=\"@*|node()\"/>\n    </xsl:copy>\n  </xsl:template>\n</xsl:stylesheet>";

    protected void setup(Mapper.Context context) throws IOException, InterruptedException {
        super.setup(context);
        this.valueOut = new Text();
        this.saxReader = new SAXReader();
        log.info((Object)"using xslt:\n<xsl:stylesheet version=\"1.0\" xmlns:xsl=\"http://www.w3.org/1999/XSL/Transform\">\n  <xsl:template match=\"@*|node()\">\n    <xsl:copy>\n      <xsl:apply-templates select=\"@*|node()\"/>\n    </xsl:copy>\n  </xsl:template>\n</xsl:stylesheet>");
        try {
            this.transformer = TransformerFactory.newInstance().newTransformer((Source)new DocumentSource(new SAXReader().read((Reader)new StringReader(xslt))));
        }
        catch (TransformerConfigurationException | DocumentException e) {
            log.error((Object)e);
            throw new RuntimeException(e);
        }
        log.info((Object)("using trasformer: '" + this.transformer.getClass().getName() + "'"));
    }

    protected void map(Text key, Text value, Mapper.Context context) throws IOException, InterruptedException {
        try {
            DocumentResult result = new DocumentResult();
            Document document = this.saxReader.read((Reader)new StringReader(value.toString()));
            this.transformer.transform((Source)new DocumentSource(document), (Result)result);
            result.getDocument().asXML();
        }
        catch (Throwable e) {
            String doi;
            context.getCounter("error", e.getClass().getName()).increment(1L);
            String c = GetInvalidXmlRecordsMapper.getInvalidXmlChar(e);
            if (StringUtils.isNotBlank((CharSequence)c)) {
                context.getCounter("invalid char", c).increment(1L);
            }
            if (StringUtils.isNotBlank((CharSequence)(doi = GetInvalidXmlRecordsMapper.getDoi(value.toString())))) {
                context.getCounter("output", "doi").increment(1L);
            }
            this.valueOut.set(value.toString());
            context.write((Object)key, (Object)this.valueOut);
        }
    }

    public static String getInvalidXmlChar(Throwable e) {
        String c;
        Pattern p;
        Matcher m;
        String error = ExceptionUtils.getRootCauseMessage((Throwable)e);
        if (StringUtils.contains((CharSequence)error, (CharSequence)"An invalid XML character") && (m = (p = Pattern.compile(".*\\(.*:\\s?(?<char>.*)\\).*")).matcher(error)).matches() && StringUtils.isNotBlank((CharSequence)(c = m.group("char")))) {
            return c;
        }
        return null;
    }

    public static String getDoi(String url) {
        Pattern pattern = Pattern.compile(DOI_REGEX);
        Matcher matcher = pattern.matcher(url);
        if (matcher.find()) {
            return matcher.group(0);
        }
        return null;
    }
}

