| JerichoHTMLTextExtractor.java |
1 /**
2 * Copyright (c) 2000-2010 Liferay, Inc. All rights reserved.
3 *
4 * This library is free software; you can redistribute it and/or modify it under
5 * the terms of the GNU Lesser General Public License as published by the Free
6 * Software Foundation; either version 2.1 of the License, or (at your option)
7 * any later version.
8 *
9 * This library is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
12 * details.
13 */
14
15 package com.liferay.util.lucene;
16
17 import com.liferay.portal.kernel.io.unsync.UnsyncStringReader;
18
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.io.Reader;
22
23 import net.htmlparser.jericho.Source;
24
25 import org.apache.jackrabbit.extractor.HTMLTextExtractor;
26
27 /**
28 * <a href="JerichoHTMLTextExtractor.java.html"><b><i>View Source</i></b></a>
29 *
30 * @author Brian Wing Shun Chan
31 */
32 public class JerichoHTMLTextExtractor extends HTMLTextExtractor {
33
34 public Reader extractText(InputStream stream, String type, String encoding)
35 throws IOException {
36
37 Source source = new Source(stream);
38
39 return new UnsyncStringReader(source.getTextExtractor().toString());
40 }
41
42 }