package eu.dnetlib.data.utility.resource_discovery.crawler.config;

import eu.dnetlib.data.utility.resource_discovery.url_filter.UrlFilter;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringWriter;
import java.util.Vector;
import net.matuschek.http.DownloadRuleSet;
import net.matuschek.http.HttpException;
import net.matuschek.http.URLLogger;
import net.matuschek.spider.WebRobot;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dlese.dpc.xml.XMLDoc;
import org.dlese.dpc.xml.XMLException;

/* loaded from: input_file:eu/dnetlib/data/utility/resource_discovery/crawler/config/Configs.class */
/**
 * Static holder for the crawler settings and helper for applying them to a {@link WebRobot}.
 *
 * <p>When the class is loaded, every setting is given a built-in default and then overridden
 * by the values found in the classpath resource {@code robot.xml}. If that file cannot be
 * read or parsed, all settings are put back to their built-in defaults so that a partially
 * applied configuration is never left behind.
 */
public class Configs {
    private static final Log logger = LogFactory.getLog(Configs.class);

    /** Classpath location of the crawler configuration file. */
    private static final String CONFIG_RESOURCE = "/eu/dnetlib/data/utility/resource_discovery/robot.xml";

    /**
     * Third and fourth arguments handed to every {@code DownloadRuleSet.addRule} call
     * (104857600 = 100 * 1024 * 1024).
     * NOTE(review): presumably the minimum / maximum document size in bytes - confirm
     * against the JoBo API documentation.
     */
    private static final int RULE_LOWER_BOUND = 1;
    private static final int RULE_UPPER_BOUND = 104857600;

    /** Passed to {@code WebRobot.setAgentName}. */
    public static String agentName;
    /** Passed to {@code WebRobot.setIgnoreRobotsTxt}. */
    public static boolean ignoreRobotsTxt;
    /** Stored as the file value multiplied by 1000; divided by 1000 again before being passed to the robot. */
    public static int sleepTime;
    /** Passed to {@code WebRobot.setMaxDepth}. */
    public static int maxDepth;
    /** Passed to {@code WebRobot.setWalkToOtherHosts}. */
    public static boolean walkToOtherHosts;
    /** Passed to {@code WebRobot.setAllowWholeHost}. */
    public static boolean allowWholeHost;
    /** Passed to {@code WebRobot.setAllowWholeDomain}. */
    public static boolean allowWholeDomain;
    /** Passed to {@code WebRobot.setFlexibleHostCheck}. */
    public static boolean flexibleHostCheck;
    /** Read from the configuration file but not applied to the robot by this class. */
    public static boolean localizeLinks;
    /** Passed to {@code WebRobot.setEnableCookies}. */
    public static boolean enableCookies;
    /** Passed to {@code WebRobot.setStartReferer} when not {@code null}. */
    public static String startReferer;
    /** Passed to {@code WebRobot.setMaxDocumentAge} only when greater than zero. */
    public static int maxDocumentAge;
    /** Passed to {@code WebRobot.setAllowedURLs} when not {@code null}. */
    public static String[] allowedUrl;
    /** Passed to {@code WebRobot.setVisitMany} when not {@code null}. */
    public static String[] visitMany;
    /** Passed to {@code WebRobot.setProxy} when not {@code null}. */
    public static String proxy;
    /** Passed to {@code WebRobot.setBandwidth} only when greater than zero. */
    public static int bandwidth;

    /**
     * Reads a classpath resource completely and returns its lines concatenated into one string.
     * Line terminators are dropped; lines are joined without any separator.
     *
     * <p>NOTE(review): the bytes are decoded with the platform default charset - consider an
     * explicit charset once the encoding of the resource is confirmed.
     *
     * @param str classpath location of the resource, resolved relative to this class
     * @return the concatenated content of the resource
     * @throws IOException if the resource does not exist or cannot be read
     */
    private static String readXMLDoc(String str) throws IOException {
        InputStream resource = Configs.class.getResourceAsStream(str);
        if (resource == null) {
            // getResourceAsStream signals a missing resource with null, not with an exception.
            throw new IOException("Configuration resource not found on classpath: " + str);
        }
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(resource));
        try {
            StringBuilder content = new StringBuilder();
            String line;
            while ((line = bufferedReader.readLine()) != null) {
                content.append(line);
            }
            return content.toString();
        } finally {
            // Close exactly once, after the whole resource has been consumed.
            bufferedReader.close();
        }
    }

    /**
     * Builds the download rules shared by both crawler configurations: {@code text/html} and
     * every mimetype reported by {@link UrlFilter#getRequestedMimeTypes()} are allowed,
     * everything else is denied by a trailing catch-all rule.
     *
     * @return the populated rule set
     * @throws IOException if a requested mimetype does not have the form {@code type/subtype}
     */
    private static DownloadRuleSet buildDownloadRuleSet() throws IOException {
        DownloadRuleSet ruleSet = new DownloadRuleSet();
        ruleSet.addRule("text", "html", RULE_LOWER_BOUND, RULE_UPPER_BOUND, true);
        for (String mimeType : UrlFilter.getRequestedMimeTypes()) {
            String[] parts = mimeType.split("/");
            if (parts.length < 2) {
                String problem = "Requested mimetype " + mimeType + " seems to be malformed";
                logger.warn("WARNING: " + problem);
                throw new IOException(problem);
            }
            ruleSet.addRule(parts[0], parts[1], RULE_LOWER_BOUND, RULE_UPPER_BOUND, true);
        }
        // Catch-all rule added last: anything not matched above is not allowed.
        ruleSet.addRule("*", "*", RULE_LOWER_BOUND, RULE_UPPER_BOUND, false);
        return ruleSet;
    }

    /** Copies an array of strings into the {@link Vector} type expected by the robot's setters. */
    private static Vector<String> toVector(String[] values) {
        Vector<String> result = new Vector<String>(values.length);
        for (String value : values) {
            result.add(value);
        }
        return result;
    }

    /**
     * Applies the fixed settings used for validation runs to the given robot. The values
     * stored in this class' static fields are not consulted.
     *
     * @param webRobot the robot to configure
     * @throws IOException if a requested mimetype is malformed
     */
    public static void configureCrawlerForValidation(WebRobot webRobot) throws IOException {
        logger.debug("Configuring crawler for validation");
        webRobot.setAgentName("Validator");
        webRobot.setIgnoreRobotsTxt(false);
        webRobot.setSleepTime(1);
        webRobot.setMaxDepth(1);
        webRobot.setWalkToOtherHosts(false);
        webRobot.setAllowWholeHost(true);
        webRobot.setAllowWholeDomain(true);
        webRobot.setFlexibleHostCheck(true);
        webRobot.setEnableCookies(true);
        webRobot.setDownloadRuleSet(buildDownloadRuleSet());
    }

    /**
     * Applies the settings held in this class' static fields to the given robot. Optional
     * settings ({@code startReferer}, {@code allowedUrl}, {@code visitMany}, {@code proxy})
     * are applied only when non-null, and {@code maxDocumentAge} / {@code bandwidth} only
     * when greater than zero.
     *
     * @param webRobot the robot to configure
     * @throws IOException if a requested mimetype is malformed
     * @throws HttpException declared for the robot setters that may raise it
     */
    public static void configureCrawler(WebRobot webRobot) throws IOException, HttpException {
        logger.debug("Configuring crawler using configuration file parameters");
        webRobot.setAgentName(agentName);
        webRobot.setIgnoreRobotsTxt(ignoreRobotsTxt);
        // sleepTime is stored multiplied by 1000; the robot receives the unscaled value.
        webRobot.setSleepTime(sleepTime / 1000);
        webRobot.setMaxDepth(maxDepth);
        webRobot.setWalkToOtherHosts(walkToOtherHosts);
        webRobot.setAllowWholeHost(allowWholeHost);
        webRobot.setAllowWholeDomain(allowWholeDomain);
        webRobot.setFlexibleHostCheck(flexibleHostCheck);
        webRobot.setEnableCookies(enableCookies);
        if (startReferer != null) {
            webRobot.setStartReferer(startReferer);
        }
        if (maxDocumentAge > 0) {
            webRobot.setMaxDocumentAge(maxDocumentAge);
        }
        if (allowedUrl != null) {
            webRobot.setAllowedURLs(toVector(allowedUrl));
        }
        if (visitMany != null) {
            webRobot.setVisitMany(toVector(visitMany));
        }
        if (proxy != null) {
            webRobot.setProxy(proxy);
        }
        if (bandwidth > 0) {
            webRobot.setBandwidth(bandwidth);
        }
        webRobot.setDownloadRuleSet(buildDownloadRuleSet());
    }

    /**
     * Runs a robot configured from the static settings and logs the output collected by a
     * {@link URLLogger}.
     *
     * @param strArr ignored
     */
    public static void main(String[] strArr) {
        WebRobot webRobot = new WebRobot();
        try {
            configureCrawler(webRobot);
            StringWriter stringWriter = new StringWriter();
            webRobot.setDocManager(new URLLogger(stringWriter));
            webRobot.run();
            logger.debug(stringWriter.getBuffer().toString());
        } catch (Exception e) {
            // Keep the stack trace; the message alone may be null and hides the cause.
            logger.error("Crawler run failed", e);
        }
    }

    /** Puts every setting back to its built-in default value. */
    private static void resetToDefaults() {
        agentName = "JoBo";
        ignoreRobotsTxt = false;
        sleepTime = 3000;
        maxDepth = 1;
        walkToOtherHosts = false;
        allowWholeHost = false;
        allowWholeDomain = false;
        flexibleHostCheck = true;
        localizeLinks = false;
        enableCookies = false;
        startReferer = null;
        maxDocumentAge = -1;
        allowedUrl = null;
        visitMany = null;
        proxy = null;
        bandwidth = -1;
    }

    /** Returns the first value of a single-valued field, or {@code fallback} when the field is absent. */
    private static String readString(XMLDoc doc, String field, String fallback) throws XMLException {
        String[] values = doc.getXmlFields(0, 1, field);
        return values.length > 0 ? values[0] : fallback;
    }

    /** Returns the first value of a field parsed as boolean, or {@code fallback} when the field is absent. */
    private static boolean readBoolean(XMLDoc doc, String field, boolean fallback) throws XMLException {
        String[] values = doc.getXmlFields(0, 1, field);
        return values.length > 0 ? Boolean.parseBoolean(values[0]) : fallback;
    }

    /**
     * Returns the first value of a field parsed as int, or {@code fallback} when the field is absent.
     *
     * @throws NumberFormatException if the value is present but is not an integer
     */
    private static int readInt(XMLDoc doc, String field, int fallback) throws XMLException {
        String[] values = doc.getXmlFields(0, 1, field);
        return values.length > 0 ? Integer.parseInt(values[0]) : fallback;
    }

    /** Returns all values of a multi-valued field, or {@code fallback} when there are none. */
    private static String[] readAll(XMLDoc doc, String field, String[] fallback) throws XMLException {
        String[] values = doc.getXmlFields(0, 0, field);
        return values.length > 0 ? values : fallback;
    }

    /**
     * Overrides the current settings with the values present in the configuration file.
     * Fields missing from the file keep their current value.
     *
     * @throws IOException if the configuration resource is missing or unreadable
     * @throws XMLException if the document cannot be parsed or queried
     * @throws NumberFormatException if an integer field holds a non-integer or out-of-range value
     */
    private static void loadFromXml() throws IOException, XMLException {
        XMLDoc doc = new XMLDoc();
        doc.useXmlString(readXMLDoc(CONFIG_RESOURCE), true, true, true);

        agentName = readString(doc, "AgentName", agentName);
        ignoreRobotsTxt = readBoolean(doc, "IgnoreRobotsTxt", ignoreRobotsTxt);

        String[] sleepValues = doc.getXmlFields(0, 1, "SleepTime");
        if (sleepValues.length > 0) {
            // Scale in long first so an oversized value is reported instead of silently wrapping.
            long scaled = Integer.parseInt(sleepValues[0]) * 1000L;
            if (scaled > Integer.MAX_VALUE || scaled < Integer.MIN_VALUE) {
                throw new NumberFormatException("SleepTime out of range: " + sleepValues[0]);
            }
            sleepTime = (int) scaled;
        }

        maxDepth = readInt(doc, "MaxDepth", maxDepth);
        walkToOtherHosts = readBoolean(doc, "WalkToOtherHosts", walkToOtherHosts);
        allowWholeHost = readBoolean(doc, "AllowWholeHost", allowWholeHost);
        allowWholeDomain = readBoolean(doc, "AllowWholeDomain", allowWholeDomain);
        flexibleHostCheck = readBoolean(doc, "FlexibleHostCheck", flexibleHostCheck);
        localizeLinks = readBoolean(doc, "LocalizeLinks", localizeLinks);
        enableCookies = readBoolean(doc, "EnableCookies", enableCookies);
        startReferer = readString(doc, "StartReferer", startReferer);
        maxDocumentAge = readInt(doc, "MaxDocumentAge", maxDocumentAge);
        allowedUrl = readAll(doc, "AllowedUrl", allowedUrl);
        visitMany = readAll(doc, "VisitMany", visitMany);
        proxy = readString(doc, "Proxy", proxy);
        bandwidth = readInt(doc, "Bandwidth", bandwidth);
    }

    static {
        resetToDefaults();
        try {
            logger.debug("Reading configuration file for crawler");
            loadFromXml();
        } catch (XMLException e) {
            // Discard anything already taken from the file so the message below holds true.
            resetToDefaults();
            logger.warn("WARNING: The file robot.xml seems to be malformed. The default settings will be used for the crawler.", e);
        } catch (IOException e2) {
            resetToDefaults();
            logger.warn("Error reading robot.xml. The default settings will be used for the crawler.", e2);
        } catch (NumberFormatException e3) {
            resetToDefaults();
            logger.warn("WARNING: The file robot.xml seems to be malformed (an integer doesn't seem to be of type integer). The default settings will be used for the crawler.", e3);
        } catch (Exception e4) {
            resetToDefaults();
            logger.error("Error configuring", e4);
        }
    }
}
