/*
 * Decompiled with CFR 0.152.
 */
package eu.dnetlib.dhp.sx.bio.ebi;

import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.AbstractScalaApplication;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.sx.bio.ebi.SparkCreatePubmedDump$;
import eu.dnetlib.dhp.sx.bio.pubmed.PMArticle;
import eu.dnetlib.dhp.sx.bio.pubmed.PMParser2;
import eu.dnetlib.dhp.sx.bio.pubmed.PubMedToOaf$;
import eu.dnetlib.dhp.utils.DHPUtils;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import org.apache.hadoop.conf.Configuration;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders$;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import scala.Function1;
import scala.Predef$;
import scala.Serializable;
import scala.StringContext;
import scala.collection.Seq;
import scala.collection.mutable.StringBuilder;
import scala.reflect.ScalaSignature;
import scala.runtime.BoxesRunTime;

@ScalaSignature(bytes="\u0006\u0001-4A!\u0001\u0002\u0001\u001f\t)2\u000b]1sW\u000e\u0013X-\u0019;f!V\u0014W.\u001a3Ek6\u0004(BA\u0002\u0005\u0003\r)'-\u001b\u0006\u0003\u000b\u0019\t1AY5p\u0015\t9\u0001\"\u0001\u0002tq*\u0011\u0011BC\u0001\u0004I\"\u0004(BA\u0006\r\u0003\u001d!g.\u001a;mS\nT\u0011!D\u0001\u0003KV\u001c\u0001a\u0005\u0002\u0001!A\u0011\u0011\u0003F\u0007\u0002%)\u00111\u0003C\u0001\fCB\u0004H.[2bi&|g.\u0003\u0002\u0016%\tA\u0012IY:ue\u0006\u001cGoU2bY\u0006\f\u0005\u000f\u001d7jG\u0006$\u0018n\u001c8\t\u0013]\u0001!\u0011!Q\u0001\na\u0011\u0013\u0001\u00049s_B,'\u000f^=QCRD\u0007CA\r \u001d\tQR$D\u0001\u001c\u0015\u0005a\u0012!B:dC2\f\u0017B\u0001\u0010\u001c\u0003\u0019\u0001&/\u001a3fM&\u0011\u0001%\t\u0002\u0007'R\u0014\u0018N\\4\u000b\u0005yY\u0012BA\f\u0015\u0011%!\u0003A!A!\u0002\u0013)\u0003&\u0001\u0003be\u001e\u001c\bc\u0001\u000e'1%\u0011qe\u0007\u0002\u0006\u0003J\u0014\u0018-_\u0005\u0003IQA\u0001B\u000b\u0001\u0003\u0002\u0003\u0006IaK\u0001\u0004Y><\u0007C\u0001\u00172\u001b\u0005i#B\u0001\u00180\u0003\u0015\u0019HN\u001a\u001bk\u0015\u0005\u0001\u0014aA8sO&\u0011!'\f\u0002\u0007\u0019><w-\u001a:\t\u000bQ\u0002A\u0011A\u001b\u0002\rqJg.\u001b;?)\u00111\u0004(\u000f\u001e\u0011\u0005]\u0002Q\"\u0001\u0002\t\u000b]\u0019\u0004\u0019\u0001\r\t\u000b\u0011\u001a\u0004\u0019A\u0013\t\u000b)\u001a\u0004\u0019A\u0016\t\u000bq\u0002A\u0011I\u001f\u0002\u0007I,h\u000eF\u0001?!\tQr(\u0003\u0002A7\t!QK\\5u\u0011\u0015\u0011\u0005\u0001\"\u0001D\u0003A\u0019'/Z1uKB+(-\\3e\tVl\u0007\u000fF\u0003?\t>\u000b6\u000bC\u0003F\u0003\u0002\u0007a)A\u0003ta\u0006\u00148\u000e\u0005\u0002H\u001b6\t\u0001J\u0003\u0002J\u0015\u0006\u00191/\u001d7\u000b\u0005\u0015[%B\u0001'0\u0003\u0019\t\u0007/Y2iK&\u0011a\n\u0013\u0002\r'B\f'o[*fgNLwN\u001c\u0005\u0006!\u0006\u0003\r\u0001G\u0001\u000bg>,(oY3QCRD\u0007\"\u0002*B\u0001\u0004A\u0012A\u0003;be\u001e,G\u000fU1uQ\")A+\u0011a\u0001+\u0006aao\\2bEVd\u0017M]5fgB\u0011akW\u0007\u0002/*\u0011\u0001,W\u0001\u000bm>\u00
1c\u0017MY;mCJL(B\u0001.\t\u0003\u0019\u0019w.\\7p]&\u0011Al\u0016\u0002\u0010->\u001c\u0017MY;mCJLxI]8va\u001e)aL\u0001E\u0001?\u0006)2\u000b]1sW\u000e\u0013X-\u0019;f!V\u0014W.\u001a3Ek6\u0004\bCA\u001ca\r\u0015\t!\u0001#\u0001b'\t\u0001'\r\u0005\u0002\u001bG&\u0011Am\u0007\u0002\u0007\u0003:L(+\u001a4\t\u000bQ\u0002G\u0011\u00014\u0015\u0003}CQ\u0001\u001b1\u0005\u0002%\fA!\\1j]R\u0011aH\u001b\u0005\u0006I\u001d\u0004\r!\n")
/*
 * Spark application that turns a PubMed XML dump into a store of JSON-serialized
 * Oaf records. The class is the decompiled form of a Scala class: the anonymous
 * "new Serializable(this){...}" instances below are the compiler-generated
 * closures passed to the Dataset map/filter calls.
 */
public class SparkCreatePubmedDump
extends AbstractScalaApplication {
    /** Logger handed in through the constructor; used by {@link #run()}. */
    private final Logger log;

    /**
     * Command-line entry point; forwards the arguments to the companion
     * singleton {@code SparkCreatePubmedDump$.MODULE$}.
     *
     * @param stringArray raw command-line arguments
     */
    public static void main(String[] stringArray) {
        SparkCreatePubmedDump$.MODULE$.main(stringArray);
    }

    /**
     * Reads the {@code isLookupUrl}, {@code sourcePath} and
     * {@code mdstoreOutputVersion} arguments from the parser, logs them,
     * resolves the output base path from the MDStore version JSON, loads the
     * vocabularies through the lookup service and then calls
     * {@link #createPubmedDump(SparkSession, String, String, VocabularyGroup)}.
     */
    public void run() {
        String isLookupUrl = this.parser().get("isLookupUrl");
        this.log.info("isLookupUrl: {}", new Object[]{isLookupUrl});
        String sourcePath = this.parser().get("sourcePath");
        this.log.info(new StringContext((Seq)Predef$.MODULE$.wrapRefArray((Object[])new String[]{"SourcePath is '", "'"})).s((Seq)Predef$.MODULE$.genericWrapArray((Object)new Object[]{sourcePath})));
        String mdstoreOutputVersion = this.parser().get("mdstoreOutputVersion");
        this.log.info(new StringContext((Seq)Predef$.MODULE$.wrapRefArray((Object[])new String[]{"mdstoreOutputVersion is '", "'"})).s((Seq)Predef$.MODULE$.genericWrapArray((Object)new Object[]{mdstoreOutputVersion})));
        // The mdstoreOutputVersion argument is a JSON document describing an MDStoreVersion;
        // its HDFS path is the base directory the dump is written under.
        ObjectMapper mapper = new ObjectMapper();
        MDStoreVersion cleanedMdStoreVersion = (MDStoreVersion)mapper.readValue(mdstoreOutputVersion, MDStoreVersion.class);
        String outputBasePath = cleanedMdStoreVersion.getHdfsPath();
        this.log.info(new StringContext((Seq)Predef$.MODULE$.wrapRefArray((Object[])new String[]{"outputBasePath is '", "'"})).s((Seq)Predef$.MODULE$.genericWrapArray((Object)new Object[]{outputBasePath})));
        ISLookUpService isLookupService = ISLookupClientFactory.getLookUpService((String)isLookupUrl);
        VocabularyGroup vocabularies = VocabularyGroup.loadVocsFromIS((ISLookUpService)isLookupService);
        this.createPubmedDump(this.spark(), sourcePath, outputBasePath, vocabularies);
    }

    /**
     * Converts the PubMed XML found at {@code sourcePath} into JSON lines.
     *
     * <p>Steps, in order: read the input as text split on the
     * {@code </PubmedArticle>} closing tag; keep, for every chunk, the part
     * starting at {@code <PubmedArticle>} and re-append the closing tag; drop
     * chunks without an opening tag; parse each chunk with {@link PMParser2};
     * de-duplicate on {@code pmid}; convert each article with
     * {@code PubMedToOaf} and serialize it to JSON; drop articles whose
     * conversion returned {@code null}; write the result gzip-compressed to
     * {@code targetPath + "/store"} (overwriting); finally count the written
     * lines and store that number in {@code targetPath + "/size"}.
     *
     * @param spark        active Spark session; must not be {@code null}
     * @param sourcePath   location of the PubMed XML input
     * @param targetPath   base output directory
     * @param vocabularies vocabularies handed to the Oaf conversion
     * @throws RuntimeException from the executing task when an article cannot
     *                          be parsed; the parser failure is attached as the cause
     */
    public void createPubmedDump(SparkSession spark, String sourcePath, String targetPath, VocabularyGroup vocabularies) {
        Predef$.MODULE$.require(spark != null);
        Encoder PMEncoder = Encoders$.MODULE$.bean(PMArticle.class);
        // Splitting on the closing tag yields one text record per article.
        Dataset df = spark.read().option("lineSep", "</PubmedArticle>").text(sourcePath);
        ObjectMapper mapper = new ObjectMapper();
        // Single definition of the store location so the write and the read-back below cannot diverge.
        String storePath = new StringBuilder().append((Object)targetPath).append((Object)"/store").toString();
        df.as(spark.implicits().newStringEncoder()).map((Function1)new Serializable(this){
            public static final long serialVersionUID = 0L;

            // Trim everything before the opening tag and restore the closing tag that
            // the line separator consumed; chunks without an opening tag become null.
            public final String apply(String s) {
                int id = s.indexOf("<PubmedArticle>");
                return id >= 0 ? new StringContext((Seq)Predef$.MODULE$.wrapRefArray((Object[])new String[]{"", "</PubmedArticle>"})).s((Seq)Predef$.MODULE$.genericWrapArray((Object)new Object[]{s.substring(id)})) : null;
            }
        }, spark.implicits().newStringEncoder()).filter((Function1)new Serializable(this){
            public static final long serialVersionUID = 0L;

            // Discard the chunks nulled out by the previous step.
            public final boolean apply(String s) {
                return s != null;
            }
        }).map((Function1)new Serializable(this){
            public static final long serialVersionUID = 0L;

            public final PMArticle apply(String i) {
                try {
                    return new PMParser2().parse(i);
                }
                catch (Exception exception) {
                    // Keep the original failure as the cause so the parser's own
                    // message and stack trace are not lost.
                    throw new RuntimeException(new StringContext((Seq)Predef$.MODULE$.wrapRefArray((Object[])new String[]{"Error parsing article: ", ""})).s((Seq)Predef$.MODULE$.genericWrapArray((Object)new Object[]{i})), exception);
                }
            }
        }, PMEncoder).dropDuplicates("pmid", (Seq)Predef$.MODULE$.wrapRefArray((Object[])new String[0])).map((Function1)new Serializable(this, vocabularies, mapper){
            public static final long serialVersionUID = 0L;
            private final VocabularyGroup vocabularies$1;
            private final ObjectMapper mapper$1;

            // Convert to Oaf and serialize; a null conversion result is passed on as null.
            public final String apply(PMArticle a) {
                Oaf oaf = PubMedToOaf$.MODULE$.convert(a, this.vocabularies$1);
                return oaf == null ? null : this.mapper$1.writeValueAsString((Object)oaf);
            }
            {
                this.vocabularies$1 = vocabularies$1;
                this.mapper$1 = mapper$1;
            }
        }, spark.implicits().newStringEncoder()).as(spark.implicits().newStringEncoder()).filter((Function1)new Serializable(this){
            public static final long serialVersionUID = 0L;

            // Drop the articles for which the conversion produced nothing.
            public final boolean apply(String s) {
                return s != null;
            }
        }).write().option("compression", "gzip").mode("overwrite").text(storePath);
        // Re-read what was just written to obtain the record count stored next to it.
        long mdStoreSize = spark.read().text(storePath).count();
        DHPUtils.writeHdfsFile((Configuration)spark.sparkContext().hadoopConfiguration(), (String)String.valueOf(BoxesRunTime.boxToLong((long)mdStoreSize)), (String)new StringBuilder().append((Object)targetPath).append((Object)"/size").toString());
    }

    /**
     * Creates the application.
     *
     * @param propertyPath forwarded unchanged to {@link AbstractScalaApplication}
     * @param args         forwarded unchanged to {@link AbstractScalaApplication}
     * @param log          logger kept by this instance and also forwarded to the superclass
     */
    public SparkCreatePubmedDump(String propertyPath, String[] args, Logger log) {
        // NOTE(review): the field is assigned before the super call, exactly as the
        // decompiler emitted it; Java releases without flexible constructor bodies
        // reject a statement ahead of super(...). Confirm the superclass constructor
        // does not depend on this ordering before swapping the two lines.
        this.log = log;
        super(propertyPath, args, log);
    }
}

