/*
 * Decompiled with CFR 0.152.
 */
package eu.openaire.publications_retriever.machine_learning;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimaps;
import com.google.common.collect.SetMultimap;
import eu.openaire.publications_retriever.crawler.PageCrawler;
import eu.openaire.publications_retriever.exceptions.DocLinkFoundException;
import eu.openaire.publications_retriever.exceptions.DocLinkInvalidException;
import eu.openaire.publications_retriever.util.args.ArgsUtils;
import eu.openaire.publications_retriever.util.file.FileUtils;
import eu.openaire.publications_retriever.util.url.UrlUtils;
import java.util.Collection;
import java.util.concurrent.atomic.AtomicInteger;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Remembers, per page-path, the HTML "tag and class" ancestry of elements that were registered through
 * {@link #addStructureOfDocUrlInMap(String, String)}, and uses those remembered structures to pick a
 * link-element on later pages that share the same page-path.
 *
 * <p>All members are static; the structure-map is wrapped with
 * {@link Multimaps#synchronizedSetMultimap(SetMultimap)}.</p>
 */
public class PageStructureMLA {
    private static final Logger logger = LoggerFactory.getLogger(PageStructureMLA.class);

    /** Counter which is not modified inside this class; presumably updated by callers (verify against usages). */
    public static AtomicInteger structurePredictedDocLinks = new AtomicInteger(0);

    /** Counter which is not modified inside this class; presumably updated by callers (verify against usages). */
    public static AtomicInteger structureValidatedDocLinks = new AtomicInteger(0);

    /** Maps a page-path (as returned by {@code UrlUtils.getPathStr}) to the set of structures stored for it. */
    public static final SetMultimap<String, String> pagePathWithDocOrDatasetUrlStructure = Multimaps.synchronizedSetMultimap(HashMultimap.create());

    /**
     * Limit used when walking up the ancestors of an element.
     * NOTE(review): with the current loop condition, the starting element plus up to this many ancestors are
     * recorded, so a structure may contain up to {@code NUM_ELEMENTS_IN_STRUCTURE + 1} entries. This is kept
     * as-is so that produced structures stay identical to those produced so far; confirm whether it is intended.
     */
    private static final int NUM_ELEMENTS_IN_STRUCTURE = 50;

    /**
     * Stores the given structure under the page-path of the given page-url.
     * Nothing is stored when no page-path could be obtained for the url.
     *
     * @param pageUrl the url of the page, from which the page-path is derived
     * @param docLinkStructure the structure to remember for that page-path
     */
    public static void addStructureOfDocUrlInMap(String pageUrl, String docLinkStructure) {
        String pagePath = UrlUtils.getPathStr(pageUrl, null);
        if (pagePath == null) {
            return;
        }
        pagePathWithDocOrDatasetUrlStructure.put(pagePath, docLinkStructure);
    }

    /**
     * Builds a textual signature of the given element and its ancestors.
     * For each visited element one line is appended: the trimmed tag-name, followed by "_" and the trimmed
     * class-name when the class-name is not empty, followed by {@code FileUtils.endOfLine}.
     * The walk starts at the given element and moves to the parent, until there is no parent or the limit is hit.
     *
     * @param element the element to start from; must not be null
     * @return the signature of the element and its visited ancestors
     */
    public static String getPageTagAndClassStructureForElement(Element element) {
        StringBuilder structure = new StringBuilder(1000);
        Element current = element;  // Walk on a local cursor, instead of re-assigning the parameter.
        int elementsCount = 0;
        do {
            structure.append(current.tagName().trim());
            String className = current.className().trim();
            if (!className.isEmpty()) {
                structure.append('_').append(className);
            }
            structure.append(FileUtils.endOfLine);

            // The limit is checked before moving to the parent, exactly like the original short-circuit condition.
            if (++elementsCount > NUM_ELEMENTS_IN_STRUCTURE) {
                break;
            }
            current = current.parent();
        } while (current != null);
        return structure.toString();
    }

    /**
     * Looks for the first element whose structure matches one of the structures stored for the page-path of
     * the given page-url. On a match, the internal link of that element is requested from {@code PageCrawler}.
     * If a link is returned, it is reported by throwing a {@link DocLinkFoundException}; otherwise a warning is
     * logged and the search stops, without examining the remaining elements.
     * The method returns silently when no page-path could be obtained or nothing is stored for that page-path.
     *
     * @param pageUrl the url of the page which contains the given elements
     * @param elementLinksOnPage the candidate elements of the page
     * @throws DocLinkFoundException when a matching element yields a non-null internal link
     * @throws DocLinkInvalidException declared by this method; presumably raised by {@code PageCrawler.getInternalLink}
     */
    public static void predictDocOrDatasetLink(String pageUrl, Elements elementLinksOnPage) throws DocLinkFoundException, DocLinkInvalidException {
        String pagePath = UrlUtils.getPathStr(pageUrl, null);
        if (pagePath == null) {
            return;
        }

        // Typed view of the stored structures; only single calls (isEmpty / contains) are made on it, no iteration.
        Collection<String> storedStructuresForPagePath = pagePathWithDocOrDatasetUrlStructure.get(pagePath);
        if (storedStructuresForPagePath.isEmpty()) {
            return;
        }

        for (Element el : elementLinksOnPage) {
            String structure = getPageTagAndClassStructureForElement(el);
            if (!storedStructuresForPagePath.contains(structure)) {
                continue;
            }

            // Parameterized logging defers the element's string conversion until the trace level is enabled.
            logger.trace("Got a hit for pagePath \"{}\"!\n{}", pagePath, el);

            String docLink = PageCrawler.getInternalLink(pageUrl, el);
            if (docLink != null) {
                throw new DocLinkFoundException(docLink, structure, true);
            }
            logger.warn("No internal-{} could be extracted from the element!", ArgsUtils.targetUrlType);
            return;
        }
        logger.warn("No {} was found, by comparing the html-structure of each element with previously stored ones, for pagePath: {}", ArgsUtils.targetUrlType, pagePath);
    }
}

