package gate.corpora;

import gate.Document;
import gate.DocumentFormat;
import gate.FeatureMap;
import gate.Resource;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.AutoInstance;
import gate.creole.metadata.CreoleResource;
import gate.event.StatusListener;
import gate.util.DocumentFormatException;
import gate.xml.XmlDocumentHandler;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URISyntaxException;
import java.util.Locale;
import java.util.Map;
import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.xml.sax.SAXException;

@CreoleResource(name = "Apache Tika Document Format", isPrivate = true, autoinstances = {@AutoInstance(hidden = true)})
/* loaded from: input_file:gate/corpora/TikaFormat.class */
public class TikaFormat extends DocumentFormat {
    private static final long serialVersionUID = 1;
    private static final Logger log = Logger.getLogger(TikaFormat.class);

    @Override // gate.creole.AbstractResource, gate.Resource
    public Resource init() throws ResourceInstantiationException {
        super.init();
        setMimeType(new MimeType("application", "tika"));
        assignMime(getMimeType(), new String[0]);
        assignMime(new MimeType("application", "pdf"), "pdf");
        assignMime(new MimeType("application", "msword"), "doc");
        assignMime(new MimeType("application", "vnd.ms-powerpoint"), "ppt");
        assignMime(new MimeType("application", "vnd.ms-excel"), "xls");
        assignMime(new MimeType("application", "vnd.openxmlformats-officedocument.wordprocessingml.document"), "docx");
        assignMime(new MimeType("application", "vnd.openxmlformats-officedocument.presentationml.presentation"), "pptx");
        assignMime(new MimeType("application", "vnd.openxmlformats-officedocument.spreadsheetml.sheet"), "xlsx");
        assignMime(new MimeType("application", "vnd.oasis.opendocument.text"), "odt");
        assignMime(new MimeType("application", "vnd.oasis.opendocument.presentation"), "odp");
        assignMime(new MimeType("application", "vnd.oasis.opendocument.spreadsheet"), "ods");
        assignMime(new MimeType("application", "rtf"), "rtf");
        return this;
    }

    private void assignMime(MimeType mimeType, String... strArr) {
        String str = mimeType.getType() + "/" + mimeType.getSubtype();
        mimeString2ClassHandlerMap.put(str, this);
        mimeString2mimeTypeMap.put(str, mimeType);
        for (String str2 : strArr) {
            suffixes2mimeTypeMap.put(str2, mimeType);
        }
    }

    @Override // gate.DocumentFormat
    public Boolean supportsRepositioning() {
        return true;
    }

    @Override // gate.DocumentFormat
    public void unpackMarkup(Document document) throws DocumentFormatException {
        unpackMarkup(document, null, null);
    }

    @Override // gate.DocumentFormat
    public void unpackMarkup(Document document, RepositioningInfo repositioningInfo, RepositioningInfo repositioningInfo2) throws DocumentFormatException {
        if (document == null || document.getSourceUrl() == null) {
            throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
        }
        StatusListener statusListener = new StatusListener() { // from class: gate.corpora.TikaFormat.1
            @Override // gate.event.StatusListener
            public void statusChanged(String str) {
                TikaFormat.this.fireStatusChanged(str);
            }
        };
        XmlDocumentHandler xmlDocumentHandler = new XmlDocumentHandler(document, this.markupElementsMap, this.element2StringMap);
        Metadata extractParserTips = extractParserTips(document);
        xmlDocumentHandler.addStatusListener(statusListener);
        xmlDocumentHandler.setRepositioningInfo(repositioningInfo);
        xmlDocumentHandler.setAmpCodingInfo(repositioningInfo2);
        InputStream inputStream = null;
        try {
            try {
                Parser parser = new TikaConfig().getParser();
                inputStream = document.getSourceUrl().openStream();
                parser.parse(inputStream, xmlDocumentHandler, extractParserTips, new ParseContext());
                setDocumentFeatures(extractParserTips, document);
                IOUtils.closeQuietly(inputStream);
                xmlDocumentHandler.removeStatusListener(statusListener);
                if (document instanceof DocumentImpl) {
                    ((DocumentImpl) document).setNextAnnotationId(xmlDocumentHandler.getCustomObjectsId());
                }
            } catch (TikaException e) {
                throw new DocumentFormatException((Exception) e);
            } catch (IOException e2) {
                throw new DocumentFormatException(e2);
            } catch (SAXException e3) {
                throw new DocumentFormatException(e3);
            }
        } catch (Throwable th) {
            IOUtils.closeQuietly(inputStream);
            xmlDocumentHandler.removeStatusListener(statusListener);
            throw th;
        }
    }

    private void setDocumentFeatures(Metadata metadata, Document document) {
        FeatureMap features = document.getFeatures();
        setTikaFeature(metadata, "title", features);
        setTikaFeature(metadata, "Author", features);
        setTikaFeature(metadata, "Comments", features);
        setTikaFeature(metadata, "creator", features);
        if (features.get("AUTHORS") == null && features.get("AUTHOR") != null) {
            features.put("AUTHORS", features.get("Author"));
        }
        features.put("MimeType", metadata.get("Content-Type"));
    }

    private void setTikaFeature(Metadata metadata, String str, Map map) {
        String str2 = metadata.get(str);
        if (str2 == null) {
            return;
        }
        String trim = str2.trim();
        if (trim.length() == 0) {
            return;
        }
        String upperCase = str.toUpperCase();
        if (map.containsKey(upperCase)) {
            map.put("TIKA_" + upperCase, trim);
        } else {
            map.put(upperCase, trim);
            map.put("TIKA_" + upperCase, trim);
        }
    }

    private Metadata extractParserTips(Document document) {
        Metadata metadata = new Metadata();
        Object obj = document.getFeatures().get("MimeType");
        if ((obj instanceof String) && !"application/tika".equals(obj)) {
            metadata.add("Content-Type", (String) document.getFeatures().get("MimeType"));
        }
        if ((document instanceof DocumentImpl) && ((DocumentImpl) document).getMimeType() != null) {
            metadata.add("Content-Type", ((DocumentImpl) document).getMimeType());
        }
        if (document.getSourceUrl() != null && document.getSourceUrl().getProtocol().startsWith("file")) {
            try {
                metadata.add("resourceName", new File(document.getSourceUrl().toURI()).getName());
            } catch (IllegalArgumentException e) {
                log.debug("Could not extract filename from uri: " + document.getSourceUrl(), e);
            } catch (URISyntaxException e2) {
                log.debug("Could not extract filename from uri: " + document.getSourceUrl(), e2);
            }
        }
        return metadata;
    }
}
