/*
 * Decompiled with CFR 0.152.
 */
package eu.dnetlib.pace.common;

import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Sets;
import com.ibm.icu.text.Transliterator;
import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
import eu.dnetlib.pace.model.FieldListImpl;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.io.Writer;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;

/**
 * Base class collecting the string helpers shared by its subclasses: text clean-up and
 * normalization, stop-word filtering, number / Roman-numeral extraction, keyword and city
 * lookup against translation maps, and classpath resource loading.
 *
 * <p>The stop-word sets, the n-gram blacklist and the city map are loaded from the classpath
 * when the class is initialized; a resource that cannot be read yields an empty collection.
 */
public abstract class AbstractPaceFunctions {
    /** Characters kept by {@link #removeSymbols(String)}; every other character becomes a space. */
    private static final String ALPHA = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";

    /**
     * Characters rewritten by {@link #fixAliases(String)}. The character at index {@code i} is
     * replaced by the character at index {@code i} of {@link #ALIASES_TO}, so both strings must
     * have the same length.
     */
    private static final String ALIASES_FROM = "\u2070\u00b9\u00b2\u00b3\u2074\u2075\u2076\u2077\u2078\u2079\u207a\u207b\u207c\u207d\u207e\u207f\u2080\u2081\u2082\u2083\u2084\u2085\u2086\u2087\u2088\u2089\u208a\u208b\u208c\u208d\u208e\u00e0\u00e1\u00e2\u00e4\u00e6\u00e3\u00e5\u0101\u00e8\u00e9\u00ea\u00eb\u0113\u0117\u0119\u0259\u00ee\u00ef\u00ed\u012b\u012f\u00ec\u00f4\u00f6\u00f2\u00f3\u0153\u00f8\u014d\u00f5\u00fb\u00fc\u00f9\u00fa\u016b\u00df\u015b\u0161\u0142\u017e\u017a\u017c\u00e7\u0107\u010d\u00f1\u0144";

    /** Replacement characters, index-aligned with {@link #ALIASES_FROM}. */
    private static final String ALIASES_TO = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";

    /** Optionally signed integer or decimal number, e.g. {@code 12}, {@code -3.5}. */
    private static final Pattern NUMBER_PATTERN = Pattern.compile("-?\\d+(\\.\\d+)?");

    /** Literal backslash-u escape followed by four hex digits, as found in raw text. */
    private static final Pattern HEX_UNICODE_PATTERN = Pattern.compile("\\\\u(\\p{XDigit}{4})");

    /** Upper-case Roman numeral; every group is optional, so the empty string matches too. */
    private static final Pattern ROMAN_NUMERAL_PATTERN =
            Pattern.compile("M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})");

    /** Maps each city alias (lower-cased, transliterated) to the first column of its CSV line. */
    private static Map<String, String> cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
    protected static Set<String> stopwords_gr = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_gr.txt");
    protected static Set<String> stopwords_en = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
    protected static Set<String> stopwords_de = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_de.txt");
    protected static Set<String> stopwords_es = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_es.txt");
    protected static Set<String> stopwords_fr = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_fr.txt");
    protected static Set<String> stopwords_it = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt");
    protected static Set<String> stopwords_pt = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt");
    /** Shared ICU transliterator used by {@link #transliterate(String)}. */
    protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng");
    protected static Set<String> ngramBlacklist = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");

    /** Regular expression matching an HTML/XML tag. */
    public final String HTML_REGEX = "<[^>]*>";

    /** Regular expression matching the DOI resolver URL prefix or the {@code doi:} scheme. */
    public final String DOI_PREFIX = "(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)";

    /** Default returned by Guava's {@code getFirst} inside {@link #getFirstValue(Field)}. */
    protected static final FieldList EMPTY_FIELD = new FieldListImpl();

    /**
     * Joins the given strings with a single space, skipping {@code null} elements.
     *
     * @param l the strings to join
     * @return the space-separated concatenation
     */
    protected String concat(List<String> l) {
        return Joiner.on(" ").skipNulls().join(l);
    }

    /**
     * Reduces a free-text value to lower-case ASCII words separated by single spaces.
     *
     * <p>Steps, in order: strip tags, lower-case, decode literal backslash-u escapes, NFD-normalize,
     * blank a few XML entities, isolate digit runs with spaces, transliterate, apply the alias
     * table, drop non-ASCII characters, turn punctuation and newlines into spaces, collapse
     * whitespace and trim.
     *
     * @param s the text to clean; must not be {@code null}
     * @return the cleaned text
     */
    protected String cleanup(String s) {
        String text = s.replaceAll(HTML_REGEX, "");
        // Locale.ROOT keeps the result independent of the JVM default locale.
        text = unicodeNormalization(text.toLowerCase(Locale.ROOT));
        text = nfd(text);
        text = fixXML(text);
        text = text.replaceAll("([0-9]+)", " $1 ");
        text = transliterate(text);
        text = fixAliases(text);
        text = text.replaceAll("[^\\p{ASCII}]", "");
        text = text.replaceAll("[\\p{Punct}]", " ");
        text = text.replaceAll("\\n", " ");
        text = text.replaceAll("(?m)\\s+", " ");
        return text.trim();
    }

    /**
     * Replaces the entities {@code &ndash;}, {@code &amp;}, {@code &quot;} and {@code &minus;}
     * with a space.
     *
     * @param a the text to process; must not be {@code null}
     * @return the text with those entities blanked
     */
    protected String fixXML(String a) {
        return a.replace("&ndash;", " ")
                .replace("&amp;", " ")
                .replace("&quot;", " ")
                .replace("&minus;", " ");
    }

    /**
     * Tells whether two strings disagree on their numeric content.
     *
     * @param a first string
     * @param b second string
     * @return {@code true} when the numbers or the Roman numerals extracted from {@code a} and
     *         {@code b} differ, {@code false} when both extractions are equal
     */
    protected boolean checkNumbers(String a, String b) {
        boolean sameNumbers = getNumbers(a).equals(getNumbers(b));
        boolean sameRomans = getRomans(a).equals(getRomans(b));
        return !(sameNumbers && sameRomans);
    }

    /**
     * Concatenates, without separator, the space-delimited tokens accepted by
     * {@link #isRoman(String)}.
     *
     * @param s the text to scan; must not be {@code null}
     * @return the concatenated Roman-numeral tokens, possibly empty
     */
    protected String getRomans(String s) {
        StringBuilder romans = new StringBuilder();
        for (String token : s.split(" ")) {
            if (isRoman(token)) {
                romans.append(token);
            }
        }
        return romans.toString();
    }

    /**
     * Tests whether the whole string is an upper-case Roman numeral.
     *
     * <p>The empty string is accepted because every part of the numeral pattern is optional.
     *
     * @param s the candidate; must not be {@code null}
     * @return {@code true} if {@code s} matches the Roman numeral pattern
     */
    protected boolean isRoman(String s) {
        // A direct match avoids the sentinel-replacement trick, which wrongly accepted an
        // input that happened to be equal to the sentinel text itself.
        return ROMAN_NUMERAL_PATTERN.matcher(s).matches();
    }

    /**
     * Concatenates, without separator, the space-delimited tokens accepted by
     * {@link #isNumber(String)}.
     *
     * <p>NOTE(review): since no separator is inserted, "1 23" and "12 3" both yield "123".
     *
     * @param s the text to scan; must not be {@code null}
     * @return the concatenated numeric tokens, possibly empty
     */
    protected String getNumbers(String s) {
        StringBuilder numbers = new StringBuilder();
        for (String token : s.split(" ")) {
            if (isNumber(token)) {
                numbers.append(token);
            }
        }
        return numbers.toString();
    }

    /**
     * Tests whether the string is an optionally signed integer or decimal number.
     *
     * @param strNum the candidate, may be {@code null}
     * @return {@code true} if {@code strNum} is non-null and fully matches the number pattern
     */
    public boolean isNumber(String strNum) {
        return strNum != null && NUMBER_PATTERN.matcher(strNum).matches();
    }

    /**
     * Replaces each character listed in the alias table (superscripts, subscripts, accented
     * letters) with its plain counterpart; other characters are copied unchanged.
     *
     * @param s the text to process; must not be {@code null}
     * @return the text with aliases replaced
     */
    protected static String fixAliases(String s) {
        StringBuilder out = new StringBuilder(s.length());
        for (int i = 0; i < s.length(); i++) {
            char ch = s.charAt(i);
            int aliasIndex = ALIASES_FROM.indexOf(ch);
            out.append(aliasIndex >= 0 ? ALIASES_TO.charAt(aliasIndex) : ch);
        }
        return out.toString();
    }

    /**
     * Transliterates the text with the shared {@link #transliterator}.
     *
     * @param s the text to transliterate
     * @return the transliterated text, or {@code s} unchanged if transliteration throws
     */
    protected static String transliterate(String s) {
        try {
            return transliterator.transliterate(s);
        }
        catch (Exception ignored) {
            // Best effort: fall back to the untouched input.
            return s;
        }
    }

    /**
     * Replaces every character that is not an ASCII letter, digit or space with a space, then
     * collapses whitespace runs into a single space.
     *
     * @param s the text to process; must not be {@code null}
     * @return the text without symbols
     */
    protected String removeSymbols(String s) {
        StringBuilder out = new StringBuilder(s.length());
        for (int i = 0; i < s.length(); i++) {
            char ch = s.charAt(i);
            out.append(ALPHA.indexOf(ch) >= 0 ? ch : ' ');
        }
        return out.toString().replaceAll("\\s+", " ");
    }

    /**
     * Returns the string value of the first element obtained by iterating over the field.
     *
     * @param values the field to read, may be {@code null}
     * @return the first element's string value, or {@code ""} if the field is null or empty
     */
    protected String getFirstValue(Field values) {
        if (values == null || Iterables.isEmpty((Iterable<?>) values)) {
            return "";
        }
        return ((Field) Iterables.getFirst((Iterable<?>) values, EMPTY_FIELD)).stringValue();
    }

    /**
     * @param s the value to test
     * @return {@code true} if {@code s} is not {@code null}
     */
    protected boolean notNull(String s) {
        return s != null;
    }

    /**
     * Normalizes a string: decodes literal backslash-u escapes, NFD-normalizes, transliterates,
     * applies the alias table, lower-cases, and then applies a fixed sequence of replacements
     * before trimming.
     *
     * <p>NOTE(review): the first replacement removes every character that is neither a space
     * nor a word character, so the later diacritic / punctuation / newline replacements only
     * see what survives it (underscores and digits become spaces).
     *
     * @param s the text to normalize; must not be {@code null}
     * @return the normalized text
     */
    protected String normalize(String s) {
        String prepared = fixAliases(transliterate(nfd(unicodeNormalization(s))));
        return prepared.toLowerCase(Locale.ROOT)
                .replaceAll("[^ \\w]+", "")
                .replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
                .replaceAll("(\\p{Punct})+", " ")
                .replaceAll("(\\d)+", " ")
                .replaceAll("(\\n)+", " ")
                .trim();
    }

    /**
     * Applies Unicode canonical decomposition (NFD).
     *
     * @param s the text to normalize
     * @return the NFD form of {@code s}
     */
    public String nfd(String s) {
        return Normalizer.normalize(s, Normalizer.Form.NFD);
    }

    /**
     * Encodes the string to UTF-8 bytes and decodes it back.
     *
     * @param s the text to round-trip; must not be {@code null}
     * @return the decoded string
     */
    public String utf8(String s) {
        byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
        return new String(bytes, StandardCharsets.UTF_8);
    }

    /**
     * Replaces each literal backslash-u escape followed by four hex digits with the character
     * it encodes.
     *
     * @param s the text to decode; must not be {@code null}
     * @return the text with escapes decoded
     */
    public String unicodeNormalization(String s) {
        Matcher m = HEX_UNICODE_PATTERN.matcher(s);
        // StringBuffer is the only target appendReplacement accepts before Java 9.
        StringBuffer buf = new StringBuffer(s.length());
        while (m.find()) {
            String decoded = String.valueOf((char) Integer.parseInt(m.group(1), 16));
            // quoteReplacement: the decoded character may itself be '$' or '\'.
            m.appendReplacement(buf, Matcher.quoteReplacement(decoded));
        }
        m.appendTail(buf);
        return buf.toString();
    }

    /**
     * Removes whitespace-delimited tokens contained in {@code stopwords} and joins the
     * remaining tokens with single spaces.
     *
     * @param s the text to filter; must not be {@code null}
     * @param stopwords the tokens to drop
     * @return the filtered text
     */
    protected String filterStopWords(String s, Set<String> stopwords) {
        StringTokenizer tokenizer = new StringTokenizer(s);
        StringBuilder kept = new StringBuilder();
        while (tokenizer.hasMoreTokens()) {
            String token = tokenizer.nextToken();
            if (!stopwords.contains(token)) {
                kept.append(token).append(" ");
            }
        }
        return kept.toString().trim();
    }

    /**
     * Applies {@link #filterStopWords(String, Set)} with the English, German, Italian, French,
     * Portuguese, Spanish and Greek stop-word sets, in that order.
     *
     * @param s the text to filter; must not be {@code null}
     * @return the text without stop words
     */
    public String filterAllStopWords(String s) {
        String filtered = filterStopWords(s, stopwords_en);
        filtered = filterStopWords(filtered, stopwords_de);
        filtered = filterStopWords(filtered, stopwords_it);
        filtered = filterStopWords(filtered, stopwords_fr);
        filtered = filterStopWords(filtered, stopwords_pt);
        filtered = filterStopWords(filtered, stopwords_es);
        filtered = filterStopWords(filtered, stopwords_gr);
        return filtered;
    }

    /**
     * Returns the elements of {@code set} that are not in {@code ngramBlacklist}, preserving
     * iteration order and dropping duplicates.
     *
     * @param set the candidate values
     * @param ngramBlacklist the values to exclude
     * @return a new insertion-ordered set with the surviving values
     */
    protected Collection<String> filterBlacklisted(Collection<String> set, Set<String> ngramBlacklist) {
        Set<String> allowed = new LinkedHashSet<>();
        for (String candidate : set) {
            if (!ngramBlacklist.contains(candidate)) {
                allowed.add(candidate);
            }
        }
        return allowed;
    }

    /**
     * Loads a classpath text resource as a set of lines, each transliterated and passed
     * through {@link #fixAliases(String)}.
     *
     * @param classpath absolute classpath location of the resource
     * @return the processed lines, or an empty set if the resource is missing or unreadable
     */
    public static Set<String> loadFromClasspath(String classpath) {
        // A local instance is required: this method runs from static initializers that
        // execute before the shared 'transliterator' field has been assigned.
        Transliterator localTransliterator = Transliterator.getInstance("Any-Eng");
        Set<String> lines = new HashSet<>();
        try (InputStream in = NGramUtils.class.getResourceAsStream(classpath)) {
            for (String line : IOUtils.readLines(in, StandardCharsets.UTF_8)) {
                lines.add(fixAliases(localTransliterator.transliterate(line)));
            }
        }
        catch (IOException | RuntimeException e) {
            // Best effort: a missing (null stream) or unreadable resource yields no entries.
            return new HashSet<>();
        }
        return lines;
    }

    /**
     * Loads a semicolon-separated classpath resource into a map. For each line the first
     * column is the value and every following column, lower-cased, transliterated and passed
     * through {@link #fixAliases(String)}, becomes a key pointing to it.
     *
     * @param classpath absolute classpath location of the resource
     * @return the key-to-value map, or an empty map if the resource is missing or unreadable
     */
    public static Map<String, String> loadMapFromClasspath(String classpath) {
        // A local instance is required: this method runs from static initializers that
        // execute before the shared 'transliterator' field has been assigned.
        Transliterator localTransliterator = Transliterator.getInstance("Any-Eng");
        Map<String, String> mapping = new HashMap<>();
        try (InputStream in = AbstractPaceFunctions.class.getResourceAsStream(classpath)) {
            for (String row : IOUtils.readLines(in, StandardCharsets.UTF_8)) {
                String[] columns = row.split(";");
                String value = columns[0];
                for (int i = 1; i < columns.length; ++i) {
                    String key = fixAliases(localTransliterator.transliterate(columns[i].toLowerCase(Locale.ROOT)));
                    mapping.put(key, value);
                }
            }
        }
        catch (IOException | RuntimeException e) {
            // Best effort: a missing (null stream) or unreadable resource yields no entries.
            return new HashMap<>();
        }
        return mapping;
    }

    /**
     * Removes every occurrence of each lower-cased keyword from the text.
     *
     * <p>Keywords are matched literally, so characters such as {@code .} or {@code (} in a
     * keyword have no special meaning.
     *
     * @param s the text to process
     * @param keywords the keywords to remove
     * @return the text without the keywords, trimmed
     */
    public String removeKeywords(String s, Set<String> keywords) {
        String padded = " " + s + " ";
        for (String keyword : keywords) {
            padded = padded.replace(keyword.toLowerCase(Locale.ROOT), "");
        }
        return padded.trim();
    }

    /**
     * Computes the share of common elements relative to the larger of the two sets.
     *
     * @param s1 first set
     * @param s2 second set
     * @return the number of elements of {@code s1} contained in {@code s2}, divided by the
     *         size of the larger set; {@code NaN} when both sets are empty
     */
    public double commonElementsPercentage(Set<String> s1, Set<String> s2) {
        double largerSize = Math.max(s1.size(), s2.size());
        long shared = s1.stream().filter(s2::contains).count();
        return (double) shared / largerSize;
    }

    /**
     * Translates each keyword through the map.
     *
     * @param keywords the keywords to translate
     * @param translationMap keyword-to-code map
     * @return the set of looked-up codes; contains {@code null} if any keyword is unmapped
     */
    public Set<String> toCodes(Set<String> keywords, Map<String, String> translationMap) {
        return keywords.stream()
                .map(keyword -> translationMap.get(keyword))
                .collect(Collectors.toSet());
    }

    /**
     * Delegates to {@link #toCodes(Set, Map)}.
     *
     * @param keywords the keywords to translate
     * @param translationMap keyword-to-code map
     * @return the set of looked-up codes
     */
    public Set<String> keywordsToCodes(Set<String> keywords, Map<String, String> translationMap) {
        return toCodes(keywords, translationMap);
    }

    /**
     * Translates city names through the city map loaded from the classpath.
     *
     * @param keywords the city names to translate
     * @return the set of looked-up codes
     */
    public Set<String> citiesToCodes(Set<String> keywords) {
        return toCodes(keywords, cityMap);
    }

    /**
     * Returns the first character of the string, lower-cased.
     *
     * @param s the text to read; must not be {@code null}
     * @return the lower-cased first character, or {@code ""} for an empty string
     */
    protected String firstLC(String s) {
        return StringUtils.substring(s, 0, 1).toLowerCase(Locale.ROOT);
    }

    /**
     * Splits the text on spaces, trimming tokens and skipping empty ones.
     *
     * @param s the text to split
     * @param maxTokens the maximum number of tokens to return
     * @return at most {@code maxTokens} tokens, in order of appearance
     */
    protected Iterable<String> tokens(String s, int maxTokens) {
        Iterable<String> all = Splitter.on(" ").omitEmptyStrings().trimResults().split(s);
        return Iterables.limit(all, maxTokens);
    }

    /**
     * Lower-cases a persistent identifier and strips every match of {@link #DOI_PREFIX}.
     *
     * @param pid the identifier; must not be {@code null}
     * @return the normalized identifier
     */
    public String normalizePid(String pid) {
        return pid.toLowerCase(Locale.ROOT).replaceAll(DOI_PREFIX, "");
    }

    /**
     * Finds the keys of {@code translationMap} occurring in the text as a run of up to
     * {@code windowSize} space-separated tokens. Longer runs are tried first; each match is
     * removed from the working text before shorter runs are tried.
     *
     * <p>The text is lower-cased once up front so that detection and removal operate on the
     * same characters for every window length.
     *
     * @param s1 the text to scan; must not be {@code null}
     * @param translationMap map whose keys are the known keywords
     * @param windowSize maximum number of tokens per candidate; values below 1 find nothing
     * @return the matched keywords (map keys, not the mapped values)
     */
    public Set<String> getKeywords(String s1, Map<String, String> translationMap, int windowSize) {
        String remaining = s1.toLowerCase(Locale.ROOT);
        List<String> tokens = Arrays.asList(remaining.split(" "));
        Set<String> found = new HashSet<>();
        for (int length = Math.min(windowSize, tokens.size()); length > 0; --length) {
            for (int i = 0; i <= tokens.size() - length; ++i) {
                String candidate = concat(tokens.subList(i, i + length));
                if (translationMap.containsKey(candidate)) {
                    found.add(candidate);
                    remaining = remaining.replace(candidate, "").trim();
                }
            }
            // Re-tokenize so shorter windows do not re-match text already consumed.
            tokens = Arrays.asList(remaining.split(" "));
        }
        return found;
    }

    /**
     * Finds city names in the text using the city map loaded from the classpath.
     *
     * @param s1 the text to scan; must not be {@code null}
     * @param windowSize maximum number of tokens per candidate
     * @return the matched city names
     * @see #getKeywords(String, Map, int)
     */
    public Set<String> getCities(String s1, int windowSize) {
        return getKeywords(s1, cityMap, windowSize);
    }

    /**
     * Reads a classpath resource, resolved relative to {@code clazz}, as a UTF-8 string.
     *
     * @param filename the resource name
     * @param clazz the class used to resolve the resource
     * @param <T> the type of {@code clazz}
     * @return the resource content
     * @throws RuntimeException if the resource does not exist or cannot be read
     */
    public static <T> String readFromClasspath(String filename, Class<T> clazz) {
        try (InputStream in = clazz.getResourceAsStream(filename)) {
            if (in == null) {
                throw new RuntimeException("cannot load resource from classpath: " + filename);
            }
            StringWriter content = new StringWriter();
            IOUtils.copy(in, content, StandardCharsets.UTF_8);
            return content.toString();
        }
        catch (IOException e) {
            throw new RuntimeException("cannot load resource from classpath: " + filename, e);
        }
    }
}

