| LuceneFileExtractor.java |
1 /**
2 * Copyright (c) 2000-2009 Liferay, Inc. All rights reserved.
3 *
4 * The contents of this file are subject to the terms of the Liferay Enterprise
5 * Subscription License ("License"). You may not use this file except in
6 * compliance with the License. You can obtain a copy of the License by
7 * contacting Liferay, Inc. See the License for the specific language governing
8 * permissions and limitations under the License, including but not limited to
9 * distribution rights of the Software.
10 *
11 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
12 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
13 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
14 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
15 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
16 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
17 * SOFTWARE.
18 */
19
20 package com.liferay.portal.search.lucene;
21
import com.liferay.portal.kernel.util.CharPool;
import com.liferay.portal.kernel.util.FileUtil;
import com.liferay.portal.kernel.util.Validator;
import com.liferay.portal.util.PropsValues;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import java.util.regex.Pattern;

import org.apache.lucene.document.Field;
35
36 /**
37 * <a href="LuceneFileExtractor.java.html"><b><i>View Source</i></b></a>
38 *
39 * @author Brian Wing Shun Chan
40 *
41 */
42 public class LuceneFileExtractor {
43
44 public Field getFile(String field, InputStream is, String fileExt) {
45 String text = FileUtil.extractText(is, fileExt);
46
47 if (Validator.isNotNull(
48 PropsValues.LUCENE_FILE_EXTRACTOR_REGEXP_STRIP)) {
49
50 text = regexpStrip(text);
51 }
52
53 return LuceneFields.getText(field, text);
54 }
55
56 public Field getFile(String field, byte[] bytes, String fileExt) {
57 InputStream is = new BufferedInputStream(
58 new ByteArrayInputStream(bytes));
59
60 return getFile(field, is, fileExt);
61 }
62
63 public Field getFile(String field, File file, String fileExt)
64 throws IOException {
65
66 InputStream is = new FileInputStream(file);
67
68 return getFile(field, is, fileExt);
69 }
70
71 protected String regexpStrip(String text) {
72 char[] array = text.toCharArray();
73
74 for (int i = 0; i < array.length; i++) {
75 String s = String.valueOf(array[i]);
76
77 if (!s.matches(PropsValues.LUCENE_FILE_EXTRACTOR_REGEXP_STRIP)) {
78 array[i] = CharPool.SPACE;
79 }
80 }
81
82 return new String(array);
83 }
84
85 }