/*
 * Decompiled with CFR 0.152.
 */
package eu.openaire.publications_retriever.crawler;

import eu.openaire.publications_retriever.crawler.PageCrawler;
import eu.openaire.publications_retriever.exceptions.DomainBlockedException;
import eu.openaire.publications_retriever.models.IdUrlMimeTypeTriple;
import eu.openaire.publications_retriever.util.args.ArgsUtils;
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
import eu.openaire.publications_retriever.util.http.HttpConnUtils;
import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
import eu.openaire.publications_retriever.util.url.UrlTypeChecker;
import eu.openaire.publications_retriever.util.url.UrlUtils;
import java.util.Locale;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Inspects the {@code <meta>} tags of an HTML page for two pieces of information:
 * the declared access rights (DC.Rights / DC.AccessRights) and a direct document url
 * (e.g. {@code citation_pdf_url}). Depending on what is found, the page is either recorded
 * as handled, or left to the caller for regular crawling.
 */
public class MetadataHandler {
    private static final Logger logger = LoggerFactory.getLogger(MetadataHandler.class);

    // Building blocks of the access-rights <meta> pattern; the attributes may appear in either order.
    private static final String metaAccessName = "name=\"DC.(?:Access)?Rights\"";
    private static final String metaAccessContent = "content=\"([^\"]+)\"";

    /** Matches a {@code <meta>} tag carrying access rights; the value is in group 1 or group 2, depending on the attributes' order. */
    public static final Pattern META_RESTRICTED_ACCESS_RIGHTS = Pattern.compile(
            "<(?:(?i)meta)(?:[^<]*" + metaAccessName + "[^<]*" + metaAccessContent
                    + "|[^<]*" + metaAccessContent + "[^<]*" + metaAccessName + ")[^>]*[/]?>",
            Pattern.CASE_INSENSITIVE);

    /** Matches (lower-cased) access-rights texts which denote a non-accessible resource; group 1 holds the kind of restriction, when available. */
    public static final Pattern NO_ACCESS_RIGHTS = Pattern.compile(".*(?:(close[d]?|embargo(?:ed)?|restrict(?:ed)?|metadata(?:\\s|%20|-|_)*only|paid)(?:(?:\\s|%20|-|_)*access)?|(?:no[t]?|nen\u00ed)(?:\\s|%20|-|_)*(?:accessible|p\u0159\u00edstupn\u00e1)|inaccessible|(?:acceso(?:\\s|%20|-|_)*)?cerrado).*");

    // Building blocks of the document-url <meta> pattern; the attributes may appear in either order.
    private static final String metaName = "name=\"(?:[^<]*(?:(?:citation|wkhealth)(?:_fulltext)?_)?pdf|eprints.document)_url\"";
    private static final String metaContent = "content=\"(http[^\"]+)\"";

    /** Matches a {@code <meta>} tag carrying a document url; the url is in group 1 or group 2, depending on the attributes' order. */
    public static final Pattern META_DOC_URL = Pattern.compile(
            "<meta(?:[^<]*" + metaName + "[^<]*" + metaContent
                    + "|[^<]*" + metaContent + "[^<]*" + metaName + ")[^>]*[/]?>",
            Pattern.CASE_INSENSITIVE);

    // Dataset-file extensions which are treated as unsupported when datasets are not retrieved.
    private static final String DATASET_EXTENSIONS = "(?:xls[xbm]?|xlt[x]?|[ct]sv|tab|(?:(?:geo)?j|b)son|(?:x|k|g|nmr|sb|wiley|y[a]?)ml|xsd|o[dt]s|ddi|rdf|[g]?zip|zipx|[rt]ar|[7x]z|[t]?gz|[gb]z[\\d]*|smi[l]?|por|ascii|dta|sav|dat|txt|ti[f]{1,2}|tfw|dwg|nt|fits|feather|svg|sas7b(?:dat|ve)|spss|sas|stata|(?:my|postgre)?sql(?:ite)?|bigquery|sh[px]|sb[xn]|prj|dbf|(?:m|acc)db|mif|mat|pcd|bt|n[sc]?[\\d]*|h[\\d]+|hdf[\\d]*|trs|opj|jcamp|fcs|fas(?:ta)?|keys|values|las|rdata|parquet|avro|sql|dcm|gr[i]?b|rds|[p]?cap|dmp|vcf|cbor|biosample|hic|warc|ig[e]?s|sla|dxf|pdb|[sc]df|cif|f(?:ast)?[qa]|apng|sra|vtp|gltf|[sm]tl|ply|abc|md|rtf|ttl|shp|shx|exr|cdf|glb|mtl|kmz|text[fF]ile)";

    // Document-file extensions which are treated as unsupported when documents are not retrieved.
    private static final String DOCUMENT_EXTENSIONS = "pdf|(?:(?:doc|ppt)[x]?|ps|epub|od[tp]|djvu|rtf)";

    // Extensions which are unsupported regardless of the retrieval mode.
    private static final String ALWAYS_UNSUPPORTED_EXTENSIONS = "apk|jpg|png";

    public static Pattern COMMON_UNSUPPORTED_META_DOC_OR_DATASET_URL_EXTENSIONS;
    public static Pattern LOCALHOST_DOMAIN_REPLACEMENT_PATTERN;
    public static AtomicInteger numOfProhibitedAccessPagesFound;
    public static AtomicInteger numOfMetaDocUrlsFound;

    /**
     * Checks the page's metadata and handles the page when the metadata is conclusive.
     * <p>
     * First, the access rights are examined; a page declaring restricted access is recorded as "unreachable".
     * Then, if documents are to be retrieved, the metaDocUrl is extracted, validated, normalized and connected to.
     *
     * @param urlId      the id of the record this page belongs to
     * @param sourceUrl  the url which led to this page
     * @param pageUrl    the url of the page
     * @param pageDomain the domain of the page
     * @param pageHtml   the html of the page; {@code null} is treated as "no metadata found"
     * @return {@code true} when the page was dealt with here and should not be crawled,
     *         {@code false} when the caller should continue by crawling the page
     */
    public static boolean checkAndHandleMetadata(String urlId, String sourceUrl, String pageUrl, String pageDomain, String pageHtml) {
        // NOTE(review): the comments passed to "addOutputData()" below mention 'MetaDocUrlsHandler.checkIfAndHandleMetaDocUrl()',
        // which looks like a former name of this method. They are left untouched, in case the output is consumed elsewhere.
        if (handleProhibitedAccessRights(urlId, sourceUrl, pageUrl, pageHtml)) {
            return true;
        }
        if (!ArgsUtils.retrieveDocuments) {
            return false;
        }

        String metaDocUrl = MetadataHandler.getMetaDocUrlFromHTML(pageHtml);
        if (metaDocUrl == null) {
            logger.trace("Could not retrieve the metaDocUrl, continue by crawling the page..");
            return false;
        }
        logger.trace("MetaDocUrl: {}", metaDocUrl);

        if (metaDocUrl.equals(pageUrl) || ConnSupportUtils.haveOnlyProtocolDifference(pageUrl, metaDocUrl) || ConnSupportUtils.isJustASlashRedirect(pageUrl, metaDocUrl)) {
            logger.debug("The metaDocUrl was found to be the same as the pageUrl! Continue by crawling the page..");
            return false;
        }

        // An unresolved template / server-side expression was left inside the url.
        if (metaDocUrl.contains("{{") || metaDocUrl.contains("<?")) {
            logger.trace("The metaDocUrl is a dynamic-link. Abort the process and block the domain of the pageUrl.");
            HttpConnUtils.blacklistedDomains.add(pageDomain);
            logger.warn("Domain: \"{}\" was blocked, after giving a dynamic metaDocUrl: {}", pageDomain, metaDocUrl);
            UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, "unreachable", "Discarded in 'MetaDocUrlsHandler.checkIfAndHandleMetaDocUrl()' method, as its metaDocUrl was a dynamic-link.", "null", null, true, "true", "true", "false", "false", "false", null, "null", "null");
            PageCrawler.contentProblematicUrls.incrementAndGet();
            return true;
        }

        // Locale.ROOT keeps the lower-casing independent of the default locale (e.g. the Turkish dotless "i").
        String lowerCaseMetaDocUrl = metaDocUrl.toLowerCase(Locale.ROOT);
        if (pointsToUnsupportedFile(lowerCaseMetaDocUrl)) {
            logger.warn("The retrieved metaDocUrl ( {} ) is pointing to an unsupported file, continue by crawling the page..", metaDocUrl);
            return false;
        }
        if (PageCrawler.NON_VALID_DOCUMENT.matcher(lowerCaseMetaDocUrl).matches()) {
            logger.warn("The retrieved metaDocUrl ( {} ) is pointing to a false-positive full-text file, avoid crawling the page..!", metaDocUrl);
            UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, "unreachable", "Discarded in 'MetaDocUrlsHandler.checkIfAndHandleMetaDocUrl()' method, as its metaDocUrl is pointing to a false-positive full-text file.", "null", null, true, "true", "true", "false", "false", "false", null, "null", "null");
            return true;
        }

        String normalizedMetaDocUrl = ConnSupportUtils.getFullyFormedUrl(pageUrl, metaDocUrl, null);
        if (normalizedMetaDocUrl != null) {
            normalizedMetaDocUrl = LoaderAndChecker.basicURLNormalizer.filter(normalizedMetaDocUrl);
        }
        if (normalizedMetaDocUrl == null) {
            logger.warn("Could not normalize metaDocUrl: {} , continue by crawling the page..", metaDocUrl);
            return false;
        }

        // Replace a "localhost" authority with the domain of the page. The replacement is quoted,
        // so that any '$' or '\' inside the domain is taken literally.
        normalizedMetaDocUrl = LOCALHOST_DOMAIN_REPLACEMENT_PATTERN.matcher(normalizedMetaDocUrl)
                .replaceFirst(Matcher.quoteReplacement("://" + pageDomain));

        // The same url may have already been retrieved through another page.
        IdUrlMimeTypeTriple originalIdUrlMimeTypeTriple = UrlUtils.resultUrlsWithIDs.get(normalizedMetaDocUrl);
        if (originalIdUrlMimeTypeTriple != null) {
            ConnSupportUtils.handleReCrossedTargetUrl(urlId, sourceUrl, pageUrl, normalizedMetaDocUrl, originalIdUrlMimeTypeTriple, false);
            numOfMetaDocUrlsFound.incrementAndGet();
            return true;
        }

        return connectToMetaDocUrl(urlId, sourceUrl, pageUrl, pageDomain, normalizedMetaDocUrl);
    }

    /**
     * Records the page as "unreachable" when its access-rights metadata denote a restricted resource.
     *
     * @return {@code true} when the page was recorded as having prohibited access, {@code false} otherwise
     */
    private static boolean handleProhibitedAccessRights(String urlId, String sourceUrl, String pageUrl, String pageHtml) {
        String metaAccessRights = MetadataHandler.getMetaAccessRightsFromHTML(pageHtml);
        if (metaAccessRights == null) {
            logger.trace("Could not retrieve the metaAccessRights for url \"{}\", continue by checking the metaDocUrl..", pageUrl);
            return false;
        }
        logger.trace("metaAccessRights: {}", metaAccessRights);

        // Locale.ROOT keeps the lower-casing independent of the default locale.
        Matcher noAccessRightsMatcher = NO_ACCESS_RIGHTS.matcher(metaAccessRights.toLowerCase(Locale.ROOT));
        if (!noAccessRightsMatcher.matches()) {
            return false;
        }

        // Group 1 is only filled by the first alternative of the pattern; fall back to a generic label otherwise.
        String noAccessCase = noAccessRightsMatcher.group(1);
        if (noAccessCase == null || noAccessCase.isEmpty()) {
            noAccessCase = "prohibited";
        }
        logger.debug("The metaAccessRights were found to be \"{}\"! Do not check the metaDocUrl, nor crawl the page!", noAccessCase);
        UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, "unreachable", "Discarded in 'MetaDocUrlsHandler.checkIfAndHandleMetaDocUrl()' method, as its accessRight were '" + noAccessCase + "'.", "null", null, true, "true", "true", "false", "false", "true", null, "null", "null");
        numOfProhibitedAccessPagesFound.incrementAndGet();
        return true;
    }

    /**
     * Tells whether the given (lower-cased) metaDocUrl matches any of the filters for unsupported targets.
     */
    private static boolean pointsToUnsupportedFile(String lowerCaseMetaDocUrl) {
        if (ArgsUtils.shouldDownloadDocFiles && UrlTypeChecker.CURRENTLY_UNSUPPORTED_DOC_EXTENSION_FILTER.matcher(lowerCaseMetaDocUrl).matches()) {
            return true;
        }
        return UrlTypeChecker.PLAIN_PAGE_EXTENSION_FILTER.matcher(lowerCaseMetaDocUrl).matches()
                || UrlTypeChecker.URL_DIRECTORY_FILTER.matcher(lowerCaseMetaDocUrl).matches()
                || COMMON_UNSUPPORTED_META_DOC_OR_DATASET_URL_EXTENSIONS.matcher(lowerCaseMetaDocUrl).matches();
    }

    /**
     * Connects to the (normalized) metaDocUrl and translates the outcome to the "handled / crawl the page" decision.
     *
     * @return {@code true} when the page needs no crawling (docUrl accepted, or the page was recorded as "unreachable"),
     *         {@code false} when the caller should continue by crawling the page
     */
    private static boolean connectToMetaDocUrl(String urlId, String sourceUrl, String pageUrl, String pageDomain, String metaDocUrl) {
        try {
            if (HttpConnUtils.connectAndCheckMimeType(urlId, sourceUrl, pageUrl, metaDocUrl, pageDomain, false, true)) {
                numOfMetaDocUrlsFound.incrementAndGet();
                return true;
            }
            logger.warn("The retrieved metaDocUrl was NOT a docUrl (unexpected): {} , continue by crawling the page..", metaDocUrl);
            return false;
        } catch (DomainBlockedException dbe) {
            // The page is discarded only when the metaDocUrl shares the page's domain;
            // presumably, a blocked foreign domain should not prevent crawling this page.
            String metaDocUrlDomain = UrlUtils.getDomainStr(metaDocUrl, null);
            if (metaDocUrlDomain != null && metaDocUrlDomain.equals(pageDomain)) {
                UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, "unreachable", "Discarded in 'MetaDocUrlsHandler.checkIfAndHandleMetaDocUrl()' method, as its domain was blocked.", "null", null, true, "true", "true", "false", "false", "false", null, "null", "null");
                return true;
            }
            return false;
        } catch (Exception e) {
            String exceptionMessage = (e instanceof RuntimeException) ? e.getMessage() : null;
            if (exceptionMessage != null && (exceptionMessage.contains("HTTP 401") || exceptionMessage.contains("HTTP 403"))) {
                logger.warn("The MetaDocUrl < {} > had authorization issues, so further crawling of this page is aborted.", metaDocUrl);
                UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, "unreachable", "Discarded in 'MetaDocUrlsHandler.checkIfAndHandleMetaDocUrl()' method, as its metaDocUrl had authorization issues.", "null", null, true, "true", "true", "false", "false", "false", null, "null", "null");
                return true;
            }
            logger.warn("The MetaDocUrl < {} > had connectivity or redirection problems! Continue by crawling the page..", metaDocUrl);
            return false;
        }
    }

    /**
     * Collects the access-rights values of all matching {@code <meta>} tags of the page.
     * Values which start with "http" or are longer than 200 characters are skipped.
     *
     * @param pageHtml the html of the page; may be {@code null}
     * @return the accepted values, each one followed by {@code " -- "}, or {@code null} when none was found
     */
    public static String getMetaAccessRightsFromHTML(String pageHtml) {
        if (pageHtml == null) {
            return null;
        }
        Matcher metaAccessRightsMatcher = META_RESTRICTED_ACCESS_RIGHTS.matcher(pageHtml);
        StringBuilder accessRights = new StringBuilder(500);
        while (metaAccessRightsMatcher.find()) {
            String currentMetaAccessRights = firstCapturedGroup(metaAccessRightsMatcher);
            if (currentMetaAccessRights == null || currentMetaAccessRights.startsWith("http") || currentMetaAccessRights.length() > 200) {
                continue;
            }
            accessRights.append(currentMetaAccessRights).append(" -- ");
        }
        return (accessRights.length() == 0) ? null : accessRights.toString();
    }

    /**
     * Extracts the url of the first document-url {@code <meta>} tag of the page.
     *
     * @param pageHtml the html of the page; may be {@code null}
     * @return the metaDocUrl, or {@code null} when the page has no such tag
     */
    public static String getMetaDocUrlFromHTML(String pageHtml) {
        if (pageHtml == null) {
            return null;
        }
        Matcher metaDocUrlMatcher = META_DOC_URL.matcher(pageHtml);
        if (!metaDocUrlMatcher.find()) {
            return null;
        }
        return firstCapturedGroup(metaDocUrlMatcher);
    }

    /**
     * Returns group 1 of a successful match, or group 2 when group 1 did not participate in it.
     * Both meta-patterns of this class capture the value in one of these two groups, depending on the attributes' order.
     */
    private static String firstCapturedGroup(Matcher successfulMatcher) {
        String value = successfulMatcher.group(1);
        return (value != null) ? value : successfulMatcher.group(2);
    }

    /**
     * Builds the regex for the url-extensions which are unsupported under the given retrieval mode.
     * The mode-specific alternatives are joined with the always-unsupported ones only when they exist,
     * so that no empty alternative (which would match any url ending with a dot) ends up in the group.
     */
    private static String buildUnsupportedExtensionsRegex(boolean retrieveDatasets, boolean retrieveDocuments) {
        StringBuilder regex = new StringBuilder(".+\\.(?:");
        if (!retrieveDatasets) {
            regex.append(DATASET_EXTENSIONS).append('|');
        } else if (!retrieveDocuments) {
            regex.append(DOCUMENT_EXTENSIONS).append('|');
        }
        return regex.append(ALWAYS_UNSUPPORTED_EXTENSIONS).append(")(?:\\?.+)?$").toString();
    }

    static {
        String regex = buildUnsupportedExtensionsRegex(ArgsUtils.retrieveDatasets, ArgsUtils.retrieveDocuments);
        logger.debug("COMMON_UNSUPPORTED_META_DOC_OR_DATASET_URL_EXTENSIONS -> REGEX: {}", regex);
        COMMON_UNSUPPORTED_META_DOC_OR_DATASET_URL_EXTENSIONS = Pattern.compile(regex);
        // The dots of the ip-address are escaped, so that they match only a literal dot.
        LOCALHOST_DOMAIN_REPLACEMENT_PATTERN = Pattern.compile("://(?:localhost|127\\.0\\.0\\.1)(?:\\:[\\d]+)?");
        numOfProhibitedAccessPagesFound = new AtomicInteger(0);
        numOfMetaDocUrlsFound = new AtomicInteger(0);
    }
}

