| LuceneFileExtractor.java |
1 /**
2 * Copyright (c) 2000-2009 Liferay, Inc. All rights reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a copy
5 * of this software and associated documentation files (the "Software"), to deal
6 * in the Software without restriction, including without limitation the rights
7 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 * copies of the Software, and to permit persons to whom the Software is
9 * furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 * SOFTWARE.
21 */
22
23 package com.liferay.portal.search.lucene;
24
import com.liferay.portal.kernel.util.CharPool;
import com.liferay.portal.kernel.util.FileUtil;
import com.liferay.portal.kernel.util.Validator;
import com.liferay.portal.util.PropsValues;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.document.Field;
38
39 /**
40 * <a href="LuceneFileExtractor.java.html"><b><i>View Source</i></b></a>
41 *
42 * @author Brian Wing Shun Chan
43 *
44 */
45 public class LuceneFileExtractor {
46
47 public Field getFile(String field, InputStream is, String fileExt) {
48 String text = FileUtil.extractText(is, fileExt);
49
50 if (Validator.isNotNull(
51 PropsValues.LUCENE_FILE_EXTRACTOR_REGEXP_STRIP)) {
52
53 text = regexpStrip(text);
54 }
55
56 return LuceneFields.getText(field, text);
57 }
58
59 public Field getFile(String field, byte[] bytes, String fileExt) {
60 InputStream is = new BufferedInputStream(
61 new ByteArrayInputStream(bytes));
62
63 return getFile(field, is, fileExt);
64 }
65
66 public Field getFile(String field, File file, String fileExt)
67 throws IOException {
68
69 InputStream is = new FileInputStream(file);
70
71 return getFile(field, is, fileExt);
72 }
73
74 protected String regexpStrip(String text) {
75 char[] array = text.toCharArray();
76
77 for (int i = 0; i < array.length; i++) {
78 String s = String.valueOf(array[i]);
79
80 if (!s.matches(PropsValues.LUCENE_FILE_EXTRACTOR_REGEXP_STRIP)) {
81 array[i] = CharPool.SPACE;
82 }
83 }
84
85 return new String(array);
86 }
87
88 }