/*
 * Decompiled with CFR 0.152.
 */
package crawlercommons.sitemaps;

import crawlercommons.filters.URLFilter;
import crawlercommons.mimetypes.MimeTypeDetector;
import crawlercommons.sitemaps.AbstractSiteMap;
import crawlercommons.sitemaps.Namespace;
import crawlercommons.sitemaps.SiteMap;
import crawlercommons.sitemaps.SiteMapIndex;
import crawlercommons.sitemaps.SiteMapURL;
import crawlercommons.sitemaps.SkipLeadingWhiteSpaceInputStream;
import crawlercommons.sitemaps.UnknownFormatException;
import crawlercommons.sitemaps.extension.Extension;
import crawlercommons.sitemaps.sax.DelegatorHandler;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.function.Consumer;
import java.util.function.Function;
import java.util.zip.GZIPInputStream;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.BOMInputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

public class SiteMapParser {
    /** Logger shared by all parser instances. */
    public static final Logger LOG = LoggerFactory.getLogger(SiteMapParser.class);
    // Upper bound on URLs per sitemap. NOTE(review): not referenced by name in this class as shown;
    // the text parser uses the literal 50000 instead - presumably the inlined value of this constant.
    private static final int MAX_URLS = 50000;
    /** Size limit constant: 0x3200000 = 52,428,800 bytes (50 MiB). Not enforced anywhere in this class as shown. */
    public static final int MAX_BYTES_ALLOWED = 0x3200000;
    // When true, text sitemap URLs that are not under the sitemap's base URL are dropped;
    // the flag is also handed to the SAX handler for XML sitemaps.
    protected boolean strict = true;
    // When true, an XML sitemap that fails with a SAXException is still returned
    // if the handler had already built a (partial) sitemap.
    private boolean allowPartial = false;
    // Forwarded to the SAX handler; the accepted namespaces are only forwarded when this is true.
    protected boolean strictNamespace = false;
    // Namespace URIs handed to the SAX handler when strictNamespace is enabled.
    protected Set<String> acceptedNamespaces = new HashSet<String>();
    // Maps an extension namespace URI to the extension it enables; forwarded to the SAX handler.
    protected Map<String, Extension> extensionNamespaces = new HashMap<String, Extension>();
    // Detects and normalizes the media type of sitemap content; created in the constructor.
    private MimeTypeDetector mimeTypeDetector;
    // When false, the XML parser is configured with the "disallow-doctype-decl" feature enabled.
    private boolean allowDocTypeDefinitions = false;
    // Applied to each URL of a text sitemap (and passed to the SAX handler); a null result
    // means the URL is skipped. The default is the identity function.
    private Function<String, String> urlFilter = url -> url;

    /**
     * Creates a strict parser that does not accept partial sitemaps.
     * Equivalent to {@code new SiteMapParser(true, false)}.
     */
    public SiteMapParser() {
        // The one-argument constructor supplies allowPartial = false.
        this(true);
    }

    /**
     * Creates a parser with the given strictness that does not accept partial sitemaps.
     *
     * @param strict value for the {@code strict} flag; {@code allowPartial} is set to {@code false}
     */
    public SiteMapParser(boolean strict) {
        this(strict, false);
    }

    /**
     * Creates a parser with explicit strictness and partial-sitemap handling,
     * and instantiates the media type detector used by the {@code parseSiteMap} methods.
     *
     * @param strict       value for the {@code strict} flag
     * @param allowPartial whether a sitemap that fails XML parsing midway may still be returned
     */
    public SiteMapParser(boolean strict, boolean allowPartial) {
        this.mimeTypeDetector = new MimeTypeDetector();
        this.allowPartial = allowPartial;
        this.strict = strict;
    }

    /**
     * Controls whether XML sitemaps may contain a DOCTYPE declaration.
     * While this is {@code false}, the XML parser is configured with the
     * {@code disallow-doctype-decl} feature enabled.
     *
     * @param allowDocTypeDefinitions {@code true} to leave that feature unset
     */
    public void setAllowDocTypeDefinitions(boolean allowDocTypeDefinitions) {
        this.allowDocTypeDefinitions = allowDocTypeDefinitions;
    }

    /**
     * @return the current value of the {@code strict} flag
     */
    public boolean isStrict() {
        return strict;
    }

    /**
     * @return the current value of the {@code strictNamespace} flag
     */
    public boolean isStrictNamespace() {
        return strictNamespace;
    }

    /**
     * Sets the {@code strictNamespace} flag. Enabling it also registers the
     * sitemaps.org 0.9 namespace as an accepted namespace.
     *
     * @param s new value of the flag
     */
    public void setStrictNamespace(boolean s) {
        strictNamespace = s;
        if (s) {
            addAcceptedNamespace("http://www.sitemaps.org/schemas/sitemap/0.9");
        }
    }

    /**
     * Adds one namespace URI to the set of accepted namespaces.
     *
     * @param namespaceUri namespace URI to accept
     */
    public void addAcceptedNamespace(String namespaceUri) {
        acceptedNamespaces.add(namespaceUri);
    }

    /**
     * Adds every namespace URI in the array to the set of accepted namespaces.
     *
     * @param namespaceUris namespace URIs to accept
     */
    public void addAcceptedNamespace(String[] namespaceUris) {
        for (int idx = 0; idx < namespaceUris.length; idx++) {
            acceptedNamespaces.add(namespaceUris[idx]);
        }
    }

    /**
     * Enables one sitemap extension by registering each namespace URI listed for it
     * in {@code Namespace.SITEMAP_EXTENSION_NAMESPACES}.
     *
     * @param extension the extension to enable
     */
    public void enableExtension(Extension extension) {
        for (String uri : Namespace.SITEMAP_EXTENSION_NAMESPACES.get(extension)) {
            extensionNamespaces.put(uri, extension);
        }
    }

    /**
     * Enables every value of {@code Extension} by registering all namespace URIs
     * listed for it in {@code Namespace.SITEMAP_EXTENSION_NAMESPACES}.
     */
    public void enableExtensions() {
        Extension[] all = Extension.values();
        for (int idx = 0; idx < all.length; idx++) {
            Extension current = all[idx];
            for (String uri : Namespace.SITEMAP_EXTENSION_NAMESPACES.get(current)) {
                extensionNamespaces.put(uri, current);
            }
        }
    }

    /**
     * Installs the function applied to every URL read from a sitemap.
     * In the text parser a {@code null} result causes the URL to be skipped.
     *
     * @param filter URL rewriting/filtering function; it is invoked without a null check,
     *               so passing {@code null} will fail later during parsing
     */
    public void setURLFilter(Function<String, String> filter) {
        this.urlFilter = filter;
    }

    /**
     * Installs a {@code URLFilter} as the URL filter by adapting its
     * {@code filter} method to a {@code Function}.
     *
     * @param filter the filter whose {@code filter} method is applied to each URL
     */
    public void setURLFilter(URLFilter filter) {
        Function<String, String> adapter = filter::filter;
        this.urlFilter = adapter;
    }

    /**
     * Downloads the content behind the given URL and parses it as a sitemap.
     * The whole response is read into memory before parsing.
     *
     * @param onlineSitemapUrl URL to fetch the sitemap from
     * @return the parsed sitemap, or {@code null} if the URL is {@code null}
     * @throws UnknownFormatException if the content cannot be identified or parsed as a sitemap
     * @throws IOException            if fetching or reading the content fails
     */
    public AbstractSiteMap parseSiteMap(URL onlineSitemapUrl) throws UnknownFormatException, IOException {
        if (onlineSitemapUrl != null) {
            return parseSiteMap(IOUtils.toByteArray(onlineSitemapUrl), onlineSitemapUrl);
        }
        return null;
    }

    /**
     * Parses the given content using the URL of an existing sitemap object and
     * copies that object's last-modified value onto the freshly parsed result.
     *
     * @param contentType media type reported for the content
     * @param content     raw sitemap bytes
     * @param sitemap     existing sitemap supplying the URL and the last-modified value
     * @return the newly parsed sitemap
     * @throws UnknownFormatException if the content cannot be parsed as a sitemap
     * @throws IOException            if reading the content fails
     */
    public AbstractSiteMap parseSiteMap(String contentType, byte[] content, AbstractSiteMap sitemap) throws UnknownFormatException, IOException {
        URL sitemapUrl = sitemap.getUrl();
        AbstractSiteMap parsed = parseSiteMap(contentType, content, sitemapUrl);
        parsed.setLastModified(sitemap.getLastModified());
        return parsed;
    }

    /**
     * Parses sitemap content whose media type is not known in advance; the type is
     * detected from the bytes themselves.
     *
     * @param content raw sitemap bytes
     * @param url     URL the content was obtained from
     * @return the parsed sitemap, or {@code null} if {@code url} is {@code null}
     * @throws UnknownFormatException if no media type can be detected or the content cannot be parsed
     * @throws IOException            if reading the content fails
     */
    public AbstractSiteMap parseSiteMap(byte[] content, URL url) throws UnknownFormatException, IOException {
        if (url == null) {
            return null;
        }
        String detectedType = mimeTypeDetector.detect(content);
        if (detectedType != null) {
            return parseSiteMap(detectedType, content, url);
        }
        throw new UnknownFormatException(String.format(Locale.ROOT, "Failed to detect MediaType of sitemap '%s'", url));
    }

    /**
     * Parses sitemap content with a caller-supplied media type.
     * <p>
     * The media type is first normalized by the detector. XML and plain-text content
     * is parsed directly. Gzip content is decompressed, the embedded media type is
     * detected on the decompressed stream, and the embedded XML or text is parsed.
     * Any other type is rejected.
     *
     * @param contentType media type reported for the content
     * @param content     raw (possibly gzipped) sitemap bytes
     * @param url         URL the content was obtained from
     * @return the parsed sitemap
     * @throws UnknownFormatException if the (embedded) media type is unsupported or cannot be
     *                                detected, or if parsing the content fails
     * @throws IOException            if reading uncompressed text content fails
     */
    public AbstractSiteMap parseSiteMap(String contentType, byte[] content, URL url) throws UnknownFormatException, IOException {
        String mimeType = this.mimeTypeDetector.normalize(contentType, content);
        if (this.mimeTypeDetector.isXml(mimeType)) {
            return this.processXml(url, content);
        }
        if (this.mimeTypeDetector.isText(mimeType)) {
            return this.processText(url, content);
        }
        if (!this.mimeTypeDetector.isGzip(mimeType)) {
            String msg = String.format(Locale.ROOT, "Can't parse a sitemap with MediaType '%s' from '%s'", contentType, url);
            throw new UnknownFormatException(msg);
        }
        try (BufferedInputStream decompressed = new BufferedInputStream(new GZIPInputStream(new ByteArrayInputStream(content)))) {
            String compressedType = this.mimeTypeDetector.detect(decompressed);
            if (this.mimeTypeDetector.isXml(compressedType)) {
                // The XML path decompresses the original bytes again on its own stream.
                return this.processGzippedXML(url, content);
            }
            if (this.mimeTypeDetector.isText(compressedType)) {
                return this.processText(url, decompressed);
            }
            if (compressedType == null) {
                String msg = String.format(Locale.ROOT, "Failed to detect embedded MediaType of gzipped sitemap '%s'", url);
                throw new UnknownFormatException(msg);
            }
            String msg = String.format(Locale.ROOT, "Can't parse a sitemap with MediaType '%s' (embedded in %s) from '%s'", compressedType, contentType, url);
            throw new UnknownFormatException(msg);
        }
        catch (UnknownFormatException e) {
            // Already carries the precise reason (unsupported embedded type, XML parse
            // failure, ...). Re-wrapping it below would mislabel it as a detection failure.
            throw e;
        }
        catch (Exception e) {
            // Broken gzip data or any other unexpected failure while decompressing/detecting;
            // reported as UnknownFormatException so callers keep seeing the same exception type.
            String msg = String.format(Locale.ROOT, "Failed to detect embedded MediaType of gzipped sitemap '%s'", url);
            throw new UnknownFormatException(msg, e);
        }
    }

    /**
     * Fetches and parses the sitemap at the given URL, then walks it, passing every
     * contained URL to {@code action}. Does nothing except log when either argument is {@code null}.
     *
     * @param onlineSitemapUrl URL of the sitemap or sitemap index to traverse
     * @param action           callback invoked for each sitemap URL
     * @throws UnknownFormatException if a sitemap cannot be parsed
     * @throws IOException            if fetching or reading a sitemap fails
     */
    public void walkSiteMap(URL onlineSitemapUrl, Consumer<SiteMapURL> action) throws UnknownFormatException, IOException {
        if (onlineSitemapUrl != null && action != null) {
            AbstractSiteMap parsed = parseSiteMap(onlineSitemapUrl);
            walkSiteMap(parsed, action);
            return;
        }
        LOG.info("Got null sitemap URL and/or action, stopping traversal");
    }

    /**
     * Walks a parsed sitemap. For a sitemap index, each referenced sitemap is fetched
     * and walked recursively via its URL; for a plain sitemap, every contained URL is
     * passed to {@code action}. {@code null} entries are skipped. Does nothing except
     * log when either argument is {@code null}.
     *
     * @param sitemap sitemap or sitemap index to traverse
     * @param action  callback invoked for each sitemap URL
     * @throws UnknownFormatException if a referenced sitemap cannot be parsed
     * @throws IOException            if fetching or reading a referenced sitemap fails
     */
    public void walkSiteMap(AbstractSiteMap sitemap, Consumer<SiteMapURL> action) throws UnknownFormatException, IOException {
        if (sitemap == null || action == null) {
            LOG.info("Got null sitemap and/or action, stopping traversal");
            return;
        }
        if (!sitemap.isIndex()) {
            // Leaf sitemap: hand each URL to the callback.
            for (SiteMapURL entry : ((SiteMap) sitemap).getSiteMapUrls()) {
                if (entry != null) {
                    action.accept(entry);
                }
            }
            return;
        }
        // Index: descend into every referenced sitemap.
        for (AbstractSiteMap child : ((SiteMapIndex) sitemap).getSitemaps()) {
            if (child != null) {
                walkSiteMap(child.getUrl(), action);
            }
        }
    }

    /**
     * Parses uncompressed XML sitemap bytes. The bytes are wrapped in a BOM-handling,
     * leading-whitespace-skipping stream and decoded as UTF-8 before being handed to
     * the SAX-based parser.
     *
     * @param sitemapUrl URL the sitemap was obtained from
     * @param xmlContent raw XML bytes
     * @return the parsed sitemap
     * @throws UnknownFormatException if the XML cannot be parsed as a sitemap
     */
    protected AbstractSiteMap processXml(URL sitemapUrl, byte[] xmlContent) throws UnknownFormatException {
        InputStream rawBytes = new ByteArrayInputStream(xmlContent);
        InputStream cleaned = new SkipLeadingWhiteSpaceInputStream(new BOMInputStream(rawBytes));
        BufferedReader utf8Reader = new BufferedReader(new InputStreamReader(cleaned, StandardCharsets.UTF_8));
        InputSource source = new InputSource(utf8Reader);
        return processXml(sitemapUrl, source);
    }

    /**
     * Parses a plain-text sitemap held in a byte array.
     *
     * @param sitemapUrl URL the sitemap was obtained from
     * @param content    raw text sitemap bytes
     * @return the parsed text sitemap
     * @throws IOException if reading the content fails
     */
    protected SiteMap processText(URL sitemapUrl, byte[] content) throws IOException {
        InputStream contentStream = new ByteArrayInputStream(content);
        return processText(sitemapUrl, contentStream);
    }

    /**
     * Parses a plain-text sitemap (one URL per line) from a stream.
     * <p>
     * Lines are decoded as UTF-8 through a {@code BOMInputStream}, trimmed, and passed
     * through the configured URL filter. Blank lines and URLs the filter maps to
     * {@code null} are skipped; lines that are not well-formed URLs are logged and skipped.
     * A URL that does not start with the sitemap's base URL is only kept when the parser
     * is not strict. Reading stops once more than 50000 lines have been read (blank
     * lines count towards that limit). The stream is not closed by this method.
     *
     * @param sitemapUrl URL the sitemap was obtained from
     * @param stream     stream delivering the text sitemap
     * @return the text sitemap, marked as processed
     * @throws IOException if reading from the stream fails
     */
    protected SiteMap processText(URL sitemapUrl, InputStream stream) throws IOException {
        LOG.debug("Processing textual Sitemap");
        SiteMap textSiteMap = new SiteMap(sitemapUrl);
        textSiteMap.setType(AbstractSiteMap.SitemapType.TEXT);

        BufferedReader reader = new BufferedReader(new InputStreamReader(new BOMInputStream(stream), StandardCharsets.UTF_8));
        int lineNo = 0;
        for (String raw = reader.readLine(); raw != null; raw = reader.readLine()) {
            // Same value as the class's URL limit; every line read counts, blank or not.
            if (++lineNo > 50000) {
                break;
            }
            String line = raw.trim();
            if (line.isEmpty()) {
                continue;
            }
            String urlFiltered = urlFilter.apply(line);
            if (urlFiltered == null) {
                // Cap the logged text at 1024 characters.
                LOG.info("Filtered url: [{}]", line.substring(0, Math.min(1024, line.length())));
                continue;
            }
            try {
                URL url = new URI(urlFiltered).toURL();
                boolean valid = SiteMapParser.urlIsValid(textSiteMap.getBaseUrl(), url.toString());
                if (!valid && strict) {
                    LOG.info("URL: {} is excluded from the sitemap as it is not a valid url = not under the base url: {}", url.toExternalForm(), textSiteMap.getBaseUrl());
                } else {
                    SiteMapURL sUrl = new SiteMapURL(url, valid);
                    textSiteMap.addSiteMapUrl(sUrl);
                    LOG.debug("  {}. {}", lineNo, sUrl);
                }
            }
            catch (IllegalArgumentException | MalformedURLException | URISyntaxException e) {
                LOG.warn("Bad url: [{}]", line.substring(0, Math.min(1024, line.length())));
            }
        }
        textSiteMap.setProcessed(true);
        return textSiteMap;
    }

    /**
     * Decompresses gzipped bytes and parses the result as an XML sitemap.
     * <p>
     * The system id given to the XML parser is the sitemap URL with a trailing
     * {@code .gz} removed. The decompression streams are closed when parsing ends,
     * whether it succeeds or fails, so the inflater behind the
     * {@code GZIPInputStream} is released promptly.
     *
     * @param url      URL the sitemap was obtained from
     * @param response gzipped sitemap bytes
     * @return the parsed sitemap
     * @throws IOException            if the bytes are not valid gzip data or a stream cannot be closed
     * @throws UnknownFormatException if the decompressed XML cannot be parsed as a sitemap
     */
    protected AbstractSiteMap processGzippedXML(URL url, byte[] response) throws IOException, UnknownFormatException {
        LOG.debug("Processing gzipped XML");
        String xmlUrl = url.toString().replaceFirst("\\.gz$", "");
        LOG.debug("XML url = {}", (Object)xmlUrl);
        // The gzip stream is declared as its own resource so it is closed even if
        // constructing one of the wrapping streams fails.
        try (GZIPInputStream gzip = new GZIPInputStream(new ByteArrayInputStream(response));
                SkipLeadingWhiteSpaceInputStream decompressed = new SkipLeadingWhiteSpaceInputStream(new BOMInputStream(gzip))) {
            InputSource in = new InputSource(decompressed);
            in.setSystemId(xmlUrl);
            return this.processXml(url, in);
        }
    }

    /**
     * Parses an XML sitemap or sitemap index from a SAX input source.
     * <p>
     * The parser is namespace-aware and non-validating, with external entities and
     * external DTD loading switched off; DOCTYPE declarations are rejected unless
     * {@code allowDocTypeDefinitions} is set. Entity resolution always yields an
     * empty document, so no external resource is fetched.
     *
     * @param sitemapUrl URL the sitemap was obtained from
     * @param is         input source delivering the XML
     * @return the parsed sitemap; if parsing failed with a SAX error, partial sitemaps are
     *         allowed and the handler had built one, that partial sitemap marked as processed
     * @throws UnknownFormatException if the XML is not a recognized sitemap format or cannot be parsed
     * @throws RuntimeException       if the parser features cannot be configured
     * @throws IllegalStateException  if no SAX parser can be created
     */
    protected AbstractSiteMap processXml(URL sitemapUrl, InputSource is) throws UnknownFormatException {
        SAXParserFactory factory = SAXParserFactory.newInstance();
        factory.setValidating(false);
        factory.setXIncludeAware(false);
        factory.setNamespaceAware(true);
        try {
            // Harden against XXE: no external entities, no external DTDs.
            factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
            factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
            factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
            if (!this.allowDocTypeDefinitions) {
                factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
            }
        }
        catch (Exception e) {
            // Keep the original exception as the cause so its stack trace is not lost.
            throw new RuntimeException("Failed to configure XML parser: " + e.toString(), e);
        }
        DelegatorHandler handler = new DelegatorHandler(sitemapUrl, this.strict);
        handler.setStrictNamespace(this.isStrictNamespace());
        if (this.isStrictNamespace()) {
            handler.setAcceptedNamespaces(this.acceptedNamespaces);
        }
        handler.setExtensionNamespaces(this.extensionNamespaces);
        handler.setURLFilter(this.urlFilter);
        try {
            SAXParser saxParser = factory.newSAXParser();
            // Resolve every entity to an empty document instead of fetching it.
            saxParser.getXMLReader().setEntityResolver((publicId, systemId) -> new InputSource(new StringReader("")));
            saxParser.parse(is, (DefaultHandler)handler);
            AbstractSiteMap sitemap = handler.getSiteMap();
            if (sitemap == null) {
                // Prefer the handler's own diagnosis when it recorded one.
                UnknownFormatException ex = handler.getException();
                if (ex != null) {
                    throw ex;
                }
                throw new UnknownFormatException("Unknown XML format for: " + String.valueOf(sitemapUrl));
            }
            return sitemap;
        }
        catch (IOException e) {
            LOG.warn("Error parsing sitemap {}: {}", (Object)sitemapUrl, (Object)e.getMessage());
            UnknownFormatException ufe = new UnknownFormatException("Failed to parse " + String.valueOf(sitemapUrl));
            ufe.initCause(e);
            throw ufe;
        }
        catch (SAXException e) {
            LOG.warn("Error parsing sitemap {}: {}", (Object)sitemapUrl, (Object)e.getMessage());
            AbstractSiteMap sitemap = handler.getSiteMap();
            if (this.allowPartial && sitemap != null) {
                LOG.warn("Processed broken/partial sitemap for '{}'", (Object)sitemapUrl);
                sitemap.setProcessed(true);
                return sitemap;
            }
            UnknownFormatException ufe = new UnknownFormatException("Failed to parse " + String.valueOf(sitemapUrl));
            ufe.initCause(e);
            throw ufe;
        }
        catch (ParserConfigurationException e) {
            throw new IllegalStateException(e);
        }
    }

    /**
     * Checks whether a URL lies under a sitemap's base URL, i.e. whether it starts
     * with the base URL string. The comparison is a plain, case-sensitive prefix test.
     *
     * @param sitemapBaseUrl base URL of the sitemap
     * @param testUrl        URL to check
     * @return {@code true} if {@code testUrl} starts with {@code sitemapBaseUrl};
     *         {@code false} otherwise, including when either argument is {@code null}
     */
    public static boolean urlIsValid(String sitemapBaseUrl, String testUrl) {
        // A missing base or test URL cannot be validated; report "not valid" rather than throwing.
        if (sitemapBaseUrl == null || testUrl == null) {
            return false;
        }
        return testUrl.startsWith(sitemapBaseUrl);
    }
}

