/*
 * Decompiled with CFR 0.152.
 */
package eu.openaire.publications_retriever.util.url;

import eu.openaire.publications_retriever.crawler.PageCrawler;
import eu.openaire.publications_retriever.util.args.ArgsUtils;
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
import eu.openaire.publications_retriever.util.url.UrlUtils;
import java.util.Locale;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class UrlTypeChecker {
    private static final Logger logger = LoggerFactory.getLogger(UrlTypeChecker.class);
    private static final String htOrPhpExtensionsPattern = "(?:[\\w]?ht(?:[\\w]{1,2})?|php[\\d]{0,2})";
    private static final String mediaExtensionsPattern = "ico|gif|jpg|jpeg|png|wav|mp3|mp4|webm|mkv|mov";
    private static final String docOrDatasetKeywords = "(?:(?:pdf|download|/doc|document|(?:/|[?]|&)file|/(?:fulltext|texteint[\u00e9e]gral)|attachment|/paper|view(?:file|doc)|/get|cgi/viewcontent.cgi\\?|t[\u00e9e]l[\u00e9e]charger|descargar)|dataset)";
    private static final String wordsPattern = "[\\w/_.,-]{0,100}";
    private static final String docOrDatasetNegativeLookAroundPattern = "(?<![\\w/_.,-]{0,100}(?:(?:pdf|download|/doc|document|(?:/|[?]|&)file|/(?:fulltext|texteint[\u00e9e]gral)|attachment|/paper|view(?:file|doc)|/get|cgi/viewcontent.cgi\\?|t[\u00e9e]l[\u00e9e]charger|descargar)|dataset)[\\w/_.,-]{0,100})(?!.*(?:(?:pdf|download|/doc|document|(?:/|[?]|&)file|/(?:fulltext|texteint[\u00e9e]gral)|attachment|/paper|view(?:file|doc)|/get|cgi/viewcontent.cgi\\?|t[\u00e9e]l[\u00e9e]charger|descargar)|dataset).*)";
    public static Pattern URL_DIRECTORY_FILTER = null;
    public static final String unsupportedDocFileTypes = "(?:(?:doc|ppt)[x]?|ps|epub|od[tp]|djvu|rtf)";
    public static final Pattern CURRENTLY_UNSUPPORTED_DOC_EXTENSION_FILTER = Pattern.compile(".+\\.(?:(?:doc|ppt)[x]?|ps|epub|od[tp]|djvu|rtf)(?:\\?.+)?$");
    public static final Pattern URL_FILE_EXTENSION_FILTER = Pattern.compile(".+\\.(?:css|js(?:\\?y)?|ico|gif|jpg|jpeg|png|wav|mp3|mp4|webm|mkv|mov|pt|bib|nt|refer|enw|ris|mso|dtl|do|asc|c|cc(?<![\\w/_.,-]{0,100}(?:(?:pdf|download|/doc|document|(?:/|[?]|&)file|/(?:fulltext|texteint[\u00e9e]gral)|attachment|/paper|view(?:file|doc)|/get|cgi/viewcontent.cgi\\?|t[\u00e9e]l[\u00e9e]charger|descargar)|dataset)[\\w/_.,-]{0,100})(?!.*(?:(?:pdf|download|/doc|document|(?:/|[?]|&)file|/(?:fulltext|texteint[\u00e9e]gral)|attachment|/paper|view(?:file|doc)|/get|cgi/viewcontent.cgi\\?|t[\u00e9e]l[\u00e9e]charger|descargar)|dataset).*)|cxx|cpp|java|py)(?:\\?.+)?$");
    public static final Pattern INTERNAL_LINKS_KEYWORDS_FILTER = Pattern.compile(".*(?:doi.org|\\?l[a]?n[g]?=|isallowed=n|site=|link(?:out|listener)|login).*");
    public static final Pattern PLAIN_PAGE_EXTENSION_FILTER = Pattern.compile(".+(?<!(?:(?:pdf|download|/doc|document|(?:/|[?]|&)file|/(?:fulltext|texteint[\u00e9e]gral)|attachment|/paper|view(?:file|doc)|/get|cgi/viewcontent.cgi\\?|t[\u00e9e]l[\u00e9e]charger|descargar)|dataset))\\.(?:(?:[\\w]?ht(?:[\\w]{1,2})?|php[\\d]{0,2})|[aj]sp[x]?|jsf|do|asc|cgi|cfm)(?:\\?(?!.*(?:(?:pdf|download|/doc|document|(?:/|[?]|&)file|/(?:fulltext|texteint[\u00e9e]gral)|attachment|/paper|view(?:file|doc)|/get|cgi/viewcontent.cgi\\?|t[\u00e9e]l[\u00e9e]charger|descargar)|dataset)).*)?$");
    public static Pattern INTERNAL_LINKS_FILE_FORMAT_FILTER = null;
    public static Pattern SPECIFIC_DOMAIN_FILTER = null;
    public static final Pattern PLAIN_DOMAIN_FILTER = Pattern.compile("^https?://[\\w.:-]+(?:/[\\w]{2})?(?:/index.(?:[\\w]?ht(?:[\\w]{1,2})?|php[\\d]{0,2}))?[/]?(?:\\?(?:locale(?:-attribute)?|ln)=[\\w_-]+)?$");
    public static AtomicInteger javascriptPageUrls = new AtomicInteger(0);
    public static AtomicInteger crawlerSensitiveDomains = new AtomicInteger(0);
    public static AtomicInteger doajResultPageUrls = new AtomicInteger(0);
    public static AtomicInteger pagesWithHtmlDocUrls = new AtomicInteger(0);
    public static AtomicInteger pagesRequireLoginToAccessDocFiles = new AtomicInteger(0);
    public static AtomicInteger pagesWithLargerCrawlingDepth = new AtomicInteger(0);
    public static AtomicInteger longToRespondUrls = new AtomicInteger(0);
    public static AtomicInteger urlsWithUnwantedForm = new AtomicInteger(0);
    public static AtomicInteger pangaeaUrls = new AtomicInteger(0);
    public static AtomicInteger pagesNotProvidingDocUrls = new AtomicInteger(0);

    public static void setRuntimeInitializedRegexes() {
        URL_DIRECTORY_FILTER = Pattern.compile("^https?://.*/(?:(?:(?:(?:discover|profile|user|survey|index|media|theme|product|deposit|default|shop|view)/(?<![\\w/_.,-]{0,100}(?:(?:pdf|download|/doc|document|(?:/|[?]|&)file|/(?:fulltext|texteint[\u00e9e]gral)|attachment|/paper|view(?:file|doc)|/get|cgi/viewcontent.cgi\\?|t[\u00e9e]l[\u00e9e]charger|descargar)|dataset)[\\w/_.,-]{0,100})(?!.*(?:(?:pdf|download|/doc|document|(?:/|[?]|&)file|/(?:fulltext|texteint[\u00e9e]gral)|attachment|/paper|view(?:file|doc)|/get|cgi/viewcontent.cgi\\?|t[\u00e9e]l[\u00e9e]charger|descargar)|dataset).*)|(?:(?:ldap|password)-)?login|ac[c]?ess(?![./]+)|sign[-]?(?:in|out|up)|session|(?:how-to-)?(?:join[^t]|subscr)|authwall|regist(?:er|ration)|submi(?:t|ssion)|(?:post|send|export|(?:wp-)?admin|home|form|career[s]?|company)/|watch|browse|import|bookmark|announcement|feedback|share[^d]|about|(?:[^/]+-)?faq|wiki|news|events|cart|support|(?:site|html)map|documentation|help|license|disclaimer|copyright|(?:site-)?polic(?:y|ies)(?!.*paper)|privacy|terms|law|principles|(?:my|your|create)?[-]?account|my(?:dspace|selection|cart)|(?:service|help)[-]?desk|settings|fund|aut[h]?or(?<![\\w/_.,-]{0,100}(?:(?:pdf|download|/doc|document|(?:/|[?]|&)file|/(?:fulltext|texteint[\u00e9e]gral)|attachment|/paper|view(?:file|doc)|/get|cgi/viewcontent.cgi\\?|t[\u00e9e]l[\u00e9e]charger|descargar)|dataset)[\\w/_.,-]{0,100})(?!.*(?:(?:pdf|download|/doc|document|(?:/|[?]|&)file|/(?:fulltext|texteint[\u00e9e]gral)|attachment|/paper|view(?:file|doc)|/get|cgi/viewcontent.cgi\\?|t[\u00e9e]l[\u00e9e]charger|descargar)|dataset).*)|journal/key|(?:journal-)?editor|author:|(?<!ntrs.nasa.gov/(?:api/)?)citation|review|external|facets|statistics|application|selfarchive|permission|ethic(s)?/.*/view/|/view/(?<![\\w/_.,-]{0,100}(?:(?:pdf|download|/doc|document|(?:/|[?]|&)file|/(?:fulltext|texteint[\u00e9e]gral)|attachment|/paper|view(?:file|doc)|/get|cgi/viewcontent.cgi\\?|t[\u00e9e]l[\u00e9e]charger|descargar)|dataset)[\\w/_.,-]{0
,100})(?!.*(?:(?:pdf|download|/doc|document|(?:/|[?]|&)file|/(?:fulltext|texteint[\u00e9e]gral)|attachment|/paper|view(?:file|doc)|/get|cgi/viewcontent.cgi\\?|t[\u00e9e]l[\u00e9e]charger|descargar)|dataset).*)|conta[c]?t|wallet|contribute|donate|our[_-][\\w]+|template|logo|image|photo/|video|advertiser|most-popular|people|(?:the)?press|for-authors|customer-service[s]?|captcha|clipboard|dropdown|widget|(?:forum|blog|column|row|js|[cr]ss|legal)/|(?:(?:advanced[-]?)?search|search/advanced|search-results|(?:[e]?books|journals)(?:-catalog)?|issue|docs|oai|(?:abstracting-)?indexing|online[-]?early|honors|awards|meetings|calendar|diversity|scholarships|invo(?:ice|lved)|errata|classroom|publish(?:-with-us)?|upload|products|forgot|home|ethics|comics|podcast|trends|bestof|booksellers|recommendations|bibliographic|volume[s]?)[/]?$|rights[-]?permissions|publication[-]?ethics|advertising|reset[-]?password|\\*/|communit(?:y|ies)|restricted|noaccess|crawlprevention|error|(?:mis|ab)use|\\?denied|gateway|defaultwebpage|sorryserver|(?<!response_type=)cookie|(?:page[-]?)?not[-]?found|(?:(?:error)?404(?:_response)?|accessibility|invalid|catalog(?:ue|ar|o)?)\\.(?:[\\w]?ht(?:[\\w]{1,2})?|php[\\d]{0,2})|(.*sharedsitesession)" + (String)(ArgsUtils.shouldJustDownloadHtmlFiles ? "" : "|(.*/view/(?<![\\w/_.,-]{0,100}(?:(?:pdf|download|/doc|document|(?:/|[?]|&)file|/(?:fulltext|texteint[\u00e9e]gral)|attachment|/paper|view(?:file|doc)|/get|cgi/viewcontent.cgi\\?|t[\u00e9e]l[\u00e9e]charger|descargar)|dataset)[\\w/_.,-]{0,100})(?!.*(?:(?:pdf|download|/doc|document|(?:/|[?]|&)file|/(?:fulltext|texteint[\u00e9e]gral)|attachment|/paper|view(?:file|doc)|/get|cgi/viewcontent.cgi\\?|t[\u00e9e]l[\u00e9e]charger|descargar)|dataset).*))|(doi.org/https://doi.org/.*pangaea." + (!ArgsUtils.retrieveDatasets ? "|pangaea.)" : ")")) + (!ArgsUtils.retrieveDatasets ? ").*)|(?:bibtext|dc(?:terms)?|[^/]*(?:tei|endnote))$)" : ")).*$)"));
        if (logger.isTraceEnabled()) {
            logger.trace("URL_DIRECTORY_FILTER:\n" + String.valueOf(URL_DIRECTORY_FILTER));
        }
        SPECIFIC_DOMAIN_FILTER = Pattern.compile("^https?://[^/]*(?<=[/.])(?:(?<!drive.)google\\.|goo.gl|gstatic|facebook|fb.me|twitter|(?:meta|xing|baidu|t|x|vk).co|insta(?:gram|paper)|tiktok|youtube|vimeo|linkedin|ebay|bing|(?:amazon|[./]analytics)\\.|s.w.org|wikipedia|myspace|yahoo|mail|pinterest|reddit|tumblr|www.ccdc.cam.ac.uk|figshare.com/collections/|datadryad.org/stash/dataset/|evernote|skype|microsoft|adobe|buffer|digg|stumbleupon|addthis|delicious|dailymotion|gostats|blog(?:ger)?|copyright|friendfeed|newsvine|telegram|getpocket|flipboard|line.me|ok.rudouban|qzone|renren|weibo|doubleclick|bit.ly|github|reviewofbooks|plu.mx|(?<!files.)wordpress|orcid.org|auth(?:oriz(?:e|ation)|entication)?\\." + (ArgsUtils.shouldJustDownloadHtmlFiles ? "" : "|(?<!manuscript.)elsevier.com|sciencedirect.com|(?:static|multimedia|tienda).elsevier.|arvojournals.org|books.openedition.org") + "|perfdrive.|services.bepress.com|(?:careers|shop).|myworkdayjobs.com|editorialmanager.com|(tandfonline.com|persee.fr|papers.ssrn.com|documentation.ird.fr|library.unisa.edu.au|publications.cnr.it)|(doaj.org/toc/)" + (ArgsUtils.shouldJustDownloadHtmlFiles ? "" : "|(dlib.org|saberes.fcecon.unr.edu.ar|eumed.net)|(rivisteweb.it|wur.nl|remeri.org.mx|cam.ac.uk|scindeks.ceon.rs|egms.de)|(bibliotecadigital.uel.br|cepr.org)|(scielosp.org(?<![\\w/_.,-]{0,100}(?:(?:pdf|download|/doc|document|(?:/|[?]|&)file|/(?:fulltext|texteint[\u00e9e]gral)|attachment|/paper|view(?:file|doc)|/get|cgi/viewcontent.cgi\\?|t[\u00e9e]l[\u00e9e]charger|descargar)|dataset)[\\w/_.,-]{0,100})(?!.*(?:(?:pdf|download|/doc|document|(?:/|[?]|&)file|/(?:fulltext|texteint[\u00e9e]gral)|attachment|/paper|view(?:file|doc)|/get|cgi/viewcontent.cgi\\?|t[\u00e9e]l[\u00e9e]charger|descargar)|dataset).*)|cepr.org|dk.um.si|apospublications.com|jorr.org|rwth-aachen.de|pubmed.ncbi.nlm.nih.gov)") + "|(200.17.137.108))[^/]*/.*$");
        if (logger.isTraceEnabled()) {
            logger.trace("SPECIFIC_DOMAIN_FILTER:\n" + String.valueOf(SPECIFIC_DOMAIN_FILTER));
        }
        INTERNAL_LINKS_FILE_FORMAT_FILTER = Pattern.compile(".+format=(?:" + (!ArgsUtils.retrieveDatasets ? "xml|" : "") + "(?:[\\w]?ht(?:[\\w]{1,2})?|php[\\d]{0,2})|rss|ris|bib|citation_|events_kml).*");
    }

    public static boolean shouldNotAcceptPageUrl(String urlId, String sourceUrl, String pageUrl, String lowerCaseUrl, boolean calledForPageUrl) {
        if (lowerCaseUrl == null) {
            lowerCaseUrl = pageUrl.toLowerCase();
        }
        Object loggingMessage = null;
        String wasUrlValid = "null";
        String groupMatch = null;
        String patternToMatch = URL_DIRECTORY_FILTER.pattern();
        try {
            Matcher matcher = URL_DIRECTORY_FILTER.matcher(lowerCaseUrl);
            if (matcher.matches()) {
                if (calledForPageUrl) {
                    groupMatch = matcher.group(1);
                    if (groupMatch != null && !groupMatch.isEmpty()) {
                        ConnSupportUtils.blockSharedSiteSessionDomains(pageUrl, null);
                        loggingMessage = "It was discarded after participating in a 'sharedSiteSession-endlessRedirectionPack': '" + groupMatch + "'.";
                        LoaderAndChecker.connProblematicUrls.incrementAndGet();
                    } else if (!ArgsUtils.shouldJustDownloadHtmlFiles && (groupMatch = matcher.group(2)) != null && !groupMatch.isEmpty()) {
                        loggingMessage = "Discarded after matching to a site having its DocUrls in larger depth: '" + groupMatch + "'.";
                        pagesWithLargerCrawlingDepth.incrementAndGet();
                    } else if (!ArgsUtils.shouldJustDownloadHtmlFiles && (groupMatch = matcher.group(3)) != null && !groupMatch.isEmpty()) {
                        loggingMessage = "Discarded after matching to a 'PANGAEA.' url with invalid form and non-docUrls in their internal links: '" + groupMatch + "'.";
                        pangaeaUrls.incrementAndGet();
                    } else {
                        loggingMessage = "Discarded after matching to a directory with problems.";
                    }
                    logger.debug("Url-\"" + pageUrl + "\": " + (String)loggingMessage);
                    UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, "unreachable", (String)loggingMessage, "null", null, true, "true", wasUrlValid, "false", "false", "false", null, "null", "null");
                }
                return true;
            }
            patternToMatch = SPECIFIC_DOMAIN_FILTER.pattern();
            matcher = SPECIFIC_DOMAIN_FILTER.matcher(lowerCaseUrl);
            if (matcher.matches()) {
                if (calledForPageUrl) {
                    groupMatch = matcher.group(1);
                    if (groupMatch != null && !groupMatch.isEmpty()) {
                        loggingMessage = "Discarded after matching to a JavaScript-using domain, other than the 'sciencedirect.com': '" + groupMatch + "'.";
                        javascriptPageUrls.incrementAndGet();
                    } else {
                        groupMatch = matcher.group(2);
                        if (groupMatch != null && !groupMatch.isEmpty()) {
                            loggingMessage = "Discarded after matching to the Results-directory: 'doaj.org/toc/': '" + groupMatch + "'.";
                            doajResultPageUrls.incrementAndGet();
                        } else {
                            groupMatch = matcher.group(3);
                            if (groupMatch != null && !groupMatch.isEmpty()) {
                                loggingMessage = "Discarded after matching to a site containing the full-text as plain-text inside its HTML: '" + groupMatch + "'.";
                                pagesWithHtmlDocUrls.incrementAndGet();
                            } else if (!ArgsUtils.shouldJustDownloadHtmlFiles && (groupMatch = matcher.group(4)) != null && !groupMatch.isEmpty()) {
                                loggingMessage = "Discarded after matching to a domain which doesn't provide docUrls: '" + groupMatch + "'.";
                                pagesNotProvidingDocUrls.incrementAndGet();
                            } else if (!ArgsUtils.shouldJustDownloadHtmlFiles && (groupMatch = matcher.group(5)) != null && !groupMatch.isEmpty()) {
                                loggingMessage = "Discarded after matching to a domain which needs login to access docFiles: '" + groupMatch + "'.";
                                pagesRequireLoginToAccessDocFiles.incrementAndGet();
                            } else if (!ArgsUtils.shouldJustDownloadHtmlFiles && (groupMatch = matcher.group(6)) != null && !groupMatch.isEmpty()) {
                                loggingMessage = "Discarded after matching to a site having its DocUrls in larger depth: '" + groupMatch + "'.";
                                pagesWithLargerCrawlingDepth.incrementAndGet();
                            } else if (!ArgsUtils.shouldJustDownloadHtmlFiles && (groupMatch = matcher.group(7)) != null && !groupMatch.isEmpty()) {
                                loggingMessage = "Discarded after matching to known domains with connectivity problems: '" + groupMatch + "'.";
                                LoaderAndChecker.connProblematicUrls.incrementAndGet();
                            } else {
                                loggingMessage = "Discarded after matching to a domain with problems.";
                            }
                        }
                    }
                    logger.debug("Url-\"" + pageUrl + "\": " + (String)loggingMessage);
                    UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, "unreachable", (String)loggingMessage, "null", null, true, "true", wasUrlValid, "false", "false", "false", null, "null", "null");
                }
                return true;
            }
            patternToMatch = PageCrawler.NON_VALID_DOCUMENT.pattern();
            matcher = PageCrawler.NON_VALID_DOCUMENT.matcher(lowerCaseUrl);
            if (matcher.matches()) {
                if (calledForPageUrl) {
                    loggingMessage = "Discarded after matching to a url leading to an invalid document!";
                    logger.debug("Url-\"" + pageUrl + "\": " + (String)loggingMessage);
                    UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, "unreachable", (String)loggingMessage, "null", null, true, "true", wasUrlValid, "false", "false", "false", null, "null", "null");
                    PageCrawler.contentProblematicUrls.incrementAndGet();
                }
                return true;
            }
            patternToMatch = PLAIN_DOMAIN_FILTER.pattern();
            matcher = PLAIN_DOMAIN_FILTER.matcher(lowerCaseUrl);
            if (matcher.matches()) {
                if (calledForPageUrl) {
                    loggingMessage = "Discarded after matching to a url having only the domain part!";
                    logger.debug("Url-\"" + pageUrl + "\": " + (String)loggingMessage);
                    UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, "unreachable", (String)loggingMessage, "null", null, true, "true", wasUrlValid, "false", "false", "false", null, "null", "null");
                }
                return true;
            }
            patternToMatch = URL_FILE_EXTENSION_FILTER.pattern();
            matcher = URL_FILE_EXTENSION_FILTER.matcher(lowerCaseUrl);
            if (matcher.matches()) {
                if (calledForPageUrl) {
                    loggingMessage = "Discarded after matching to a url having an irrelevant extension!";
                    logger.debug("Url-\"" + pageUrl + "\": " + (String)loggingMessage);
                    UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, "unreachable", (String)loggingMessage, "null", null, true, "true", wasUrlValid, "false", "false", "false", null, "null", "null");
                }
                return true;
            }
            if (ArgsUtils.shouldDownloadDocFiles) {
                patternToMatch = CURRENTLY_UNSUPPORTED_DOC_EXTENSION_FILTER.pattern();
                matcher = CURRENTLY_UNSUPPORTED_DOC_EXTENSION_FILTER.matcher(lowerCaseUrl);
                if (matcher.matches()) {
                    if (calledForPageUrl) {
                        loggingMessage = "Discarded after matching to a url having an unsupported document extension!";
                        logger.debug("Url-\"" + pageUrl + "\": " + (String)loggingMessage);
                        UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, "unreachable", (String)loggingMessage, "null", null, true, "true", wasUrlValid, "false", "false", "false", null, "null", "null");
                    }
                    return true;
                }
            }
        }
        catch (Exception e) {
            logger.error("Error when evaluating url \"" + lowerCaseUrl + "\" with regex: " + patternToMatch, e);
            return true;
        }
        return false;
    }

    public static boolean shouldNotAcceptInternalLink(String linkStr, String lowerCaseLink) {
        if (lowerCaseLink == null) {
            lowerCaseLink = linkStr.toLowerCase();
        }
        try {
            return UrlTypeChecker.shouldNotAcceptPageUrl(null, null, linkStr, lowerCaseLink, false) || INTERNAL_LINKS_KEYWORDS_FILTER.matcher(lowerCaseLink).matches() || INTERNAL_LINKS_FILE_FORMAT_FILTER.matcher(lowerCaseLink).matches() || PLAIN_PAGE_EXTENSION_FILTER.matcher(lowerCaseLink).matches();
        }
        catch (Exception e) {
            logger.error("Error when evaluating url \"" + lowerCaseLink + "\" with regexes of \"UrlTypeChecker.shouldNotAcceptInternalLink()\".", e);
            return true;
        }
    }
}

