001    /**
002     * Copyright (c) 2000-2012 Liferay, Inc. All rights reserved.
003     *
004     * This library is free software; you can redistribute it and/or modify it under
005     * the terms of the GNU Lesser General Public License as published by the Free
006     * Software Foundation; either version 2.1 of the License, or (at your option)
007     * any later version.
008     *
009     * This library is distributed in the hope that it will be useful, but WITHOUT
010     * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
011     * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
012     * details.
013     */
014    
015    package com.liferay.portal.util;
016    
017    import com.liferay.portal.kernel.util.CharPool;
018    import com.liferay.portal.kernel.util.Html;
019    import com.liferay.portal.kernel.util.HttpUtil;
020    import com.liferay.portal.kernel.util.StringBundler;
021    import com.liferay.portal.kernel.util.StringPool;
022    import com.liferay.portal.kernel.util.StringUtil;
023    import com.liferay.portal.kernel.util.Validator;
024    
025    import java.util.regex.Matcher;
026    import java.util.regex.Pattern;
027    
028    import net.htmlparser.jericho.Renderer;
029    import net.htmlparser.jericho.Source;
030    import net.htmlparser.jericho.TextExtractor;
031    
032    /**
033     * @author Brian Wing Shun Chan
034     * @author Clarence Shen
035     * @author Harry Mark
036     * @author Samuel Kong
037     * @author Connor McKay
038     * @author Shuyang Zhou
039     */
040    public class HtmlImpl implements Html {
041    
042            public static final int ESCAPE_MODE_ATTRIBUTE = 1;
043    
044            public static final int ESCAPE_MODE_CSS = 2;
045    
046            public static final int ESCAPE_MODE_JS = 3;
047    
048            public static final int ESCAPE_MODE_TEXT = 4;
049    
050            public static final int ESCAPE_MODE_URL = 5;
051    
052            public String escape(String text) {
053                    if (text == null) {
054                            return null;
055                    }
056    
057                    if (text.length() == 0) {
058                            return StringPool.BLANK;
059                    }
060    
061                    // Escape using XSS recommendations from
062                    // http://www.owasp.org/index.php/Cross_Site_Scripting
063                    // #How_to_Protect_Yourself
064    
065                    StringBundler sb = null;
066    
067                    int lastReplacementIndex = 0;
068    
069                    for (int i = 0; i < text.length(); i++) {
070                            char c = text.charAt(i);
071    
072                            String replacement = null;
073    
074                            switch (c) {
075                                    case '<':
076                                            replacement = "&lt;";
077    
078                                            break;
079    
080                                    case '>':
081                                            replacement = "&gt;";
082    
083                                            break;
084    
085                                    case '&':
086                                            replacement = "&amp;";
087    
088                                            break;
089    
090                                    case '"':
091                                            replacement = "&#034;";
092    
093                                            break;
094    
095                                    case '\'':
096                                            replacement = "&#039;";
097    
098                                            break;
099    
100                                    case '\u00bb': // '�'
101                                            replacement = "&#187;";
102    
103                                            break;
104    
105                                    case '\u2013':
106                                            replacement = "&#x2013;";
107    
108                                            break;
109    
110                                    case '\u2014':
111                                            replacement = "&#x2014;";
112    
113                                            break;
114                            }
115    
116                            if (replacement != null) {
117                                    if (sb == null) {
118                                            sb = new StringBundler();
119                                    }
120    
121                                    if (i > lastReplacementIndex) {
122                                            sb.append(text.substring(lastReplacementIndex, i));
123                                    }
124    
125                                    sb.append(replacement);
126    
127                                    lastReplacementIndex = i + 1;
128                            }
129                    }
130    
131                    if (sb == null) {
132                            return text;
133                    }
134                    else {
135                            if (lastReplacementIndex < text.length()) {
136                                    sb.append(text.substring(lastReplacementIndex));
137                            }
138    
139                            return sb.toString();
140                    }
141            }
142    
143            public String escape(String text, int type) {
144                    if (text == null) {
145                            return null;
146                    }
147    
148                    if (text.length() == 0) {
149                            return StringPool.BLANK;
150                    }
151    
152                    String prefix = StringPool.BLANK;
153                    String postfix = StringPool.BLANK;
154    
155                    if (type == ESCAPE_MODE_ATTRIBUTE) {
156                            prefix = "&#x";
157                            postfix = StringPool.SEMICOLON;
158                    }
159                    else if (type == ESCAPE_MODE_CSS) {
160                            prefix = StringPool.BACK_SLASH;
161                    }
162                    else if (type == ESCAPE_MODE_JS) {
163                            prefix = "\\x";
164                    }
165                    else if (type == ESCAPE_MODE_URL) {
166                            return HttpUtil.encodeURL(text, true);
167                    }
168                    else {
169                            return escape(text);
170                    }
171    
172                    StringBuilder sb = new StringBuilder();
173    
174                    for (int i = 0; i < text.length(); i++) {
175                            char c = text.charAt(i);
176    
177                            if (Character.isLetterOrDigit(c) ||
178                                    (c == CharPool.DASH) || (c == CharPool.UNDERLINE)) {
179    
180                                    sb.append(c);
181                            }
182                            else {
183                                    sb.append(prefix);
184    
185                                    String hexString = StringUtil.toHexString(c);
186    
187                                    if (hexString.length() == 1) {
188                                            sb.append(StringPool.ASCII_TABLE[48]);
189                                    }
190    
191                                    sb.append(hexString);
192                                    sb.append(postfix);
193                            }
194                    }
195    
196                    if (sb.length() == text.length()) {
197                            return text;
198                    }
199                    else {
200                            return sb.toString();
201                    }
202            }
203    
204            public String escapeAttribute(String attribute) {
205                    return escape(attribute, ESCAPE_MODE_ATTRIBUTE);
206            }
207    
208            public String escapeCSS(String css) {
209                    return escape(css, ESCAPE_MODE_CSS);
210            }
211    
212            public String escapeHREF(String href) {
213                    if (href == null) {
214                            return null;
215                    }
216    
217                    if (href.length() == 0) {
218                            return StringPool.BLANK;
219                    }
220    
221                    if (href.indexOf(StringPool.COLON) == 10) {
222                            String protocol = href.substring(0, 10).toLowerCase();
223    
224                            if (protocol.equals("javascript")) {
225                                    return StringUtil.replaceFirst(href, StringPool.COLON, "%3a");
226                            }
227                    }
228    
229                    return href;
230            }
231    
232            public String escapeJS(String js) {
233                    return escape(js, ESCAPE_MODE_JS);
234            }
235    
236            public String escapeURL(String url) {
237                    return escape(url, ESCAPE_MODE_URL);
238            }
239    
240            public String escapeXPath(String xPath) {
241                    if (Validator.isNull(xPath)) {
242                            return xPath;
243                    }
244    
245                    StringBuilder sb = new StringBuilder(xPath.length());
246    
247                    for (int i = 0; i < xPath.length(); i++) {
248                            char c = xPath.charAt(i);
249    
250                            boolean hasToken = false;
251    
252                            for (int j = 0; j < _XPATH_TOKENS.length; j++) {
253                                    if (c == _XPATH_TOKENS[j]) {
254                                            hasToken = true;
255    
256                                            break;
257                                    }
258                            }
259    
260                            if (hasToken) {
261                                    sb.append(StringPool.UNDERLINE);
262                            }
263                            else {
264                                    sb.append(c);
265                            }
266                    }
267    
268                    return sb.toString();
269            }
270    
271            public String escapeXPathAttribute(String xPathAttribute) {
272                    boolean hasApostrophe = xPathAttribute.contains(StringPool.APOSTROPHE);
273                    boolean hasQuote = xPathAttribute.contains(StringPool.QUOTE);
274    
275                    if (hasQuote && hasApostrophe) {
276                            String[] parts = xPathAttribute.split(StringPool.APOSTROPHE);
277    
278                            return "concat('".concat(
279                                    StringUtil.merge(parts, "', \"'\", '")).concat("')");
280                    }
281    
282                    if (hasQuote) {
283                            return StringPool.APOSTROPHE.concat(xPathAttribute).concat(
284                                    StringPool.APOSTROPHE);
285                    }
286    
287                    return StringPool.QUOTE.concat(xPathAttribute).concat(StringPool.QUOTE);
288            }
289    
290            public String extractText(String html) {
291                    if (html == null) {
292                            return null;
293                    }
294    
295                    Source source = new Source(html);
296    
297                    TextExtractor textExtractor = source.getTextExtractor();
298    
299                    return textExtractor.toString();
300            }
301    
302            public String fromInputSafe(String text) {
303                    return StringUtil.replace(text, "&amp;", "&");
304            }
305    
306            public String render(String html) {
307                    if (html == null) {
308                            return null;
309                    }
310    
311                    Source source = new Source(html);
312    
313                    Renderer renderer = source.getRenderer();
314    
315                    return renderer.toString();
316            }
317    
318            public String replaceMsWordCharacters(String text) {
319                    return StringUtil.replace(text, _MS_WORD_UNICODE, _MS_WORD_HTML);
320            }
321    
322            public String stripBetween(String text, String tag) {
323                    return StringUtil.stripBetween(text, "<" + tag, "</" + tag + ">");
324            }
325    
326            public String stripComments(String text) {
327                    return StringUtil.stripBetween(text, "<!--", "-->");
328            }
329    
330            public String stripHtml(String text) {
331                    if (text == null) {
332                            return null;
333                    }
334    
335                    text = stripComments(text);
336    
337                    StringBuilder sb = new StringBuilder(text.length());
338    
339                    int x = 0;
340                    int y = text.indexOf("<");
341    
342                    while (y != -1) {
343                            sb.append(text.substring(x, y));
344                            sb.append(StringPool.SPACE);
345    
346                            // Look for text enclosed by <script></script>
347    
348                            boolean scriptFound = isScriptTag(text, y + 1);
349    
350                            if (scriptFound) {
351                                    int pos = y + _TAG_SCRIPT.length;
352    
353                                    // Find end of the tag
354    
355                                    pos = text.indexOf(">", pos);
356    
357                                    if (pos >= 0) {
358    
359                                            // Check if preceding character is / (i.e. is this instance
360                                            // of <script/>)
361    
362                                            if (text.charAt(pos-1) != '/') {
363    
364                                                    // Search for the ending </script> tag
365    
366                                                    for (;;) {
367                                                            pos = text.indexOf("</", pos);
368    
369                                                            if (pos >= 0) {
370                                                                    if (isScriptTag(text, pos + 2)) {
371                                                                            y = pos;
372    
373                                                                            break;
374                                                                    }
375                                                                    else {
376    
377                                                                            // Skip past "</"
378    
379                                                                            pos += 2;
380                                                                    }
381                                                            }
382                                                            else {
383                                                                    break;
384                                                            }
385                                                    }
386                                            }
387                                    }
388                            }
389    
390                            x = text.indexOf(">", y);
391    
392                            if (x == -1) {
393                                    break;
394                            }
395    
396                            x++;
397    
398                            if (x < y) {
399    
400                                    // <b>Hello</b
401    
402                                    break;
403                            }
404    
405                            y = text.indexOf("<", x);
406                    }
407    
408                    if (y == -1) {
409                            sb.append(text.substring(x));
410                    }
411    
412                    return sb.toString();
413            }
414    
415            public String toInputSafe(String text) {
416                    return StringUtil.replace(
417                            text,
418                            new String[] {"&", "\""},
419                            new String[] {"&amp;", "&quot;"});
420            }
421    
422            public String unescape(String text) {
423                    if (text == null) {
424                            return null;
425                    }
426    
427                    if (text.length() == 0) {
428                            return StringPool.BLANK;
429                    }
430    
431                    // Optimize this
432    
433                    text = StringUtil.replace(text, "&lt;", "<");
434                    text = StringUtil.replace(text, "&gt;", ">");
435                    text = StringUtil.replace(text, "&amp;", "&");
436                    text = StringUtil.replace(text, "&#034;", "\"");
437                    text = StringUtil.replace(text, "&#039;", "'");
438                    text = StringUtil.replace(text, "&#040;", "(");
439                    text = StringUtil.replace(text, "&#041;", ")");
440                    text = StringUtil.replace(text, "&#044;", ",");
441                    text = StringUtil.replace(text, "&#035;", "#");
442                    text = StringUtil.replace(text, "&#037;", "%");
443                    text = StringUtil.replace(text, "&#059;", ";");
444                    text = StringUtil.replace(text, "&#061;", "=");
445                    text = StringUtil.replace(text, "&#043;", "+");
446                    text = StringUtil.replace(text, "&#045;", "-");
447    
448                    return text;
449            }
450    
451            public String unescapeCDATA(String text) {
452                    if (text == null) {
453                            return null;
454                    }
455    
456                    if (text.length() == 0) {
457                            return StringPool.BLANK;
458                    }
459    
460                    text = StringUtil.replace(text, "&lt;![CDATA[", "<![CDATA[");
461                    text = StringUtil.replace(text, "]]&gt;", "]]>");
462    
463                    return text;
464            }
465    
466            public String wordBreak(String text, int columns) {
467                    StringBundler sb = new StringBundler();
468    
469                    int length = 0;
470                    int lastWrite = 0;
471                    int pos = 0;
472    
473                    Pattern pattern = Pattern.compile("([\\s<&]|$)");
474    
475                    Matcher matcher = pattern.matcher(text);
476    
477                    while (matcher.find()) {
478                            if (matcher.start() < pos) {
479                                    continue;
480                            }
481    
482                            while ((length + matcher.start() - pos) >= columns) {
483                                    pos += columns - length;
484    
485                                    sb.append(text.substring(lastWrite, pos));
486                                    sb.append("<wbr/>&shy;");
487    
488                                    length = 0;
489                                    lastWrite = pos;
490                            }
491    
492                            length += matcher.start() - pos;
493    
494                            String group = matcher.group();
495    
496                            if (group.equals(StringPool.AMPERSAND)) {
497                                    int x = text.indexOf(StringPool.SEMICOLON, matcher.start());
498    
499                                    if (x != -1) {
500                                            length++;
501                                            pos = x + 1;
502                                    }
503    
504                                    continue;
505                            }
506    
507                            if (group.equals(StringPool.LESS_THAN)) {
508                                    int x = text.indexOf(StringPool.GREATER_THAN, matcher.start());
509    
510                                    if (x != -1) {
511                                            pos = x + 1;
512                                    }
513    
514                                    continue;
515                            }
516    
517                            if (group.equals(StringPool.SPACE) ||
518                                    group.equals(StringPool.NEW_LINE)) {
519    
520                                    length = 0;
521                                    pos = matcher.start() + 1;
522                            }
523                    }
524    
525                    sb.append(text.substring(lastWrite));
526    
527                    return sb.toString();
528            }
529    
530            protected boolean isScriptTag(String text, int pos) {
531                    if ((pos + _TAG_SCRIPT.length + 1) <= text.length()) {
532                            char item;
533    
534                            for (int i = 0; i < _TAG_SCRIPT.length; i++) {
535                                    item = text.charAt(pos++);
536    
537                                    if (Character.toLowerCase(item) != _TAG_SCRIPT[i]) {
538                                            return false;
539                                    }
540                            }
541    
542                            item = text.charAt(pos);
543    
544                            // Check that char after "script" is not a letter (i.e. another tag)
545    
546                            return !Character.isLetter(item);
547                    }
548                    else {
549                            return false;
550                    }
551            }
552    
553            private static final String[] _MS_WORD_HTML = new String[] {
554                    "&reg;", StringPool.APOSTROPHE, StringPool.QUOTE, StringPool.QUOTE
555            };
556    
557            private static final String[] _MS_WORD_UNICODE = new String[] {
558                    "\u00ae", "\u2019", "\u201c", "\u201d"
559            };
560    
561            private static final char[] _TAG_SCRIPT = {'s', 'c', 'r', 'i', 'p', 't'};
562    
563            // See http://www.w3.org/TR/xpath20/#lexical-structure
564    
565            private static final char[] _XPATH_TOKENS = {
566                    '(', ')', '[', ']', '.', '@', ',', ':', '/', '|', '+', '-', '=', '!',
567                    '<', '>', '*', '$', '"', '"', ' ', 9, 10, 13, 133, 8232};
568    
569    }