001    /**
002     * Copyright (c) 2000-present Liferay, Inc. All rights reserved.
003     *
004     * This library is free software; you can redistribute it and/or modify it under
005     * the terms of the GNU Lesser General Public License as published by the Free
006     * Software Foundation; either version 2.1 of the License, or (at your option)
007     * any later version.
008     *
009     * This library is distributed in the hope that it will be useful, but WITHOUT
010     * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
011     * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
012     * details.
013     */
014    
015    package com.liferay.portal.metadata;
016    
017    import com.liferay.portal.fabric.InputResource;
018    import com.liferay.portal.kernel.exception.SystemException;
019    import com.liferay.portal.kernel.io.DummyWriter;
020    import com.liferay.portal.kernel.log.Log;
021    import com.liferay.portal.kernel.log.LogFactoryUtil;
022    import com.liferay.portal.kernel.process.ClassPathUtil;
023    import com.liferay.portal.kernel.process.ProcessCallable;
024    import com.liferay.portal.kernel.process.ProcessChannel;
025    import com.liferay.portal.kernel.process.ProcessException;
026    import com.liferay.portal.kernel.process.ProcessExecutorUtil;
027    import com.liferay.portal.kernel.util.ArrayUtil;
028    import com.liferay.portal.kernel.util.FileUtil;
029    import com.liferay.portal.util.PropsValues;
030    
031    import java.io.File;
032    import java.io.FileInputStream;
033    import java.io.IOException;
034    import java.io.InputStream;
035    
036    import java.util.concurrent.Future;
037    
038    import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
039    import org.apache.commons.lang.exception.ExceptionUtils;
040    import org.apache.pdfbox.exceptions.CryptographyException;
041    import org.apache.poi.EncryptedDocumentException;
042    import org.apache.tika.exception.TikaException;
043    import org.apache.tika.metadata.Metadata;
044    import org.apache.tika.metadata.XMPDM;
045    import org.apache.tika.parser.ParseContext;
046    import org.apache.tika.parser.Parser;
047    import org.apache.tika.sax.WriteOutContentHandler;
048    
049    import org.xml.sax.ContentHandler;
050    
051    /**
052     * @author Miguel Pastor
053     * @author Alexander Chow
054     * @author Shuyang Zhou
055     */
056    public class TikaRawMetadataProcessor extends XugglerRawMetadataProcessor {
057    
058            public void setParser(Parser parser) {
059                    _parser = parser;
060            }
061    
062            protected static Metadata extractMetadata(
063                            File file, Metadata metadata, Parser parser)
064                    throws IOException {
065    
066                    if (metadata == null) {
067                            metadata = new Metadata();
068                    }
069    
070                    ParseContext parserContext = new ParseContext();
071    
072                    parserContext.set(Parser.class, parser);
073    
074                    ContentHandler contentHandler = new WriteOutContentHandler(
075                            new DummyWriter());
076    
077                    try (InputStream inputStream = new FileInputStream(file)) {
078                            parser.parse(inputStream, contentHandler, metadata, parserContext);
079                    }
080                    catch (Exception e) {
081                            Throwable throwable = ExceptionUtils.getRootCause(e);
082    
083                            if ((throwable instanceof CryptographyException) ||
084                                    (throwable instanceof EncryptedDocumentException) ||
085                                    (throwable instanceof UnsupportedZipFeatureException)) {
086    
087                                    if (_log.isWarnEnabled()) {
088                                            _log.warn(
089                                                    "Unable to extract metadata from an encrypted file");
090                                    }
091                            }
092                            else if (e instanceof TikaException) {
093                                    if (_log.isWarnEnabled()) {
094                                            _log.warn("Unable to extract metadata");
095                                    }
096                            }
097                            else {
098                                    _log.error(e, e);
099                            }
100    
101                            throw new IOException(e);
102                    }
103    
104                    // Remove potential security risks
105    
106                    metadata.remove(XMPDM.ABS_PEAK_AUDIO_FILE_PATH.getName());
107                    metadata.remove(XMPDM.RELATIVE_PEAK_AUDIO_FILE_PATH.getName());
108    
109                    return metadata;
110            }
111    
112            @Override
113            protected Metadata extractMetadata(
114                    String extension, String mimeType, File file) {
115    
116                    Metadata metadata = super.extractMetadata(extension, mimeType, file);
117    
118                    boolean forkProcess = false;
119    
120                    if (PropsValues.TEXT_EXTRACTION_FORK_PROCESS_ENABLED) {
121                            if (ArrayUtil.contains(
122                                            PropsValues.TEXT_EXTRACTION_FORK_PROCESS_MIME_TYPES,
123                                            mimeType)) {
124    
125                                    forkProcess = true;
126                            }
127                    }
128    
129                    if (forkProcess) {
130                            ExtractMetadataProcessCallable extractMetadataProcessCallable =
131                                    new ExtractMetadataProcessCallable(file, metadata, _parser);
132    
133                            try {
134                                    ProcessChannel<Metadata> processChannel =
135                                            ProcessExecutorUtil.execute(
136                                                    ClassPathUtil.getPortalProcessConfig(),
137                                                    extractMetadataProcessCallable);
138    
139                                    Future<Metadata> future =
140                                            processChannel.getProcessNoticeableFuture();
141    
142                                    return future.get();
143                            }
144                            catch (Exception e) {
145                                    throw new SystemException(e);
146                            }
147                    }
148    
149                    try {
150                            return extractMetadata(file, metadata, _parser);
151                    }
152                    catch (IOException ioe) {
153                            throw new SystemException(ioe);
154                    }
155            }
156    
157            @Override
158            protected Metadata extractMetadata(
159                    String extension, String mimeType, InputStream inputStream) {
160    
161                    File file = FileUtil.createTempFile();
162    
163                    try {
164                            FileUtil.write(file, inputStream);
165    
166                            return extractMetadata(extension, mimeType, file);
167                    }
168                    catch (Exception e) {
169                            throw new SystemException(e);
170                    }
171                    finally {
172                            file.delete();
173                    }
174            }
175    
176            private static final Log _log = LogFactoryUtil.getLog(
177                    TikaRawMetadataProcessor.class);
178    
179            private Parser _parser;
180    
181            private static class ExtractMetadataProcessCallable
182                    implements ProcessCallable<Metadata> {
183    
184                    public ExtractMetadataProcessCallable(
185                            File file, Metadata metadata, Parser parser) {
186    
187                            _file = file;
188                            _metadata = metadata;
189                            _parser = parser;
190                    }
191    
192                    @Override
193                    public Metadata call() throws ProcessException {
194                            try {
195                                    return extractMetadata(_file, _metadata, _parser);
196                            }
197                            catch (IOException ioe) {
198                                    throw new ProcessException(ioe);
199                            }
200                    }
201    
202                    private static final long serialVersionUID = 1L;
203    
204                    @InputResource
205                    private final File _file;
206    
207                    private final Metadata _metadata;
208                    private final Parser _parser;
209    
210            }
211    
212    }