001    /**
002     * Copyright (c) 2000-2012 Liferay, Inc. All rights reserved.
003     *
004     * This library is free software; you can redistribute it and/or modify it under
005     * the terms of the GNU Lesser General Public License as published by the Free
006     * Software Foundation; either version 2.1 of the License, or (at your option)
007     * any later version.
008     *
009     * This library is distributed in the hope that it will be useful, but WITHOUT
010     * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
011     * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
012     * details.
013     */
014    
015    package com.liferay.portal.util;
016    
017    import com.liferay.portal.kernel.util.CharPool;
018    import com.liferay.portal.kernel.util.Html;
019    import com.liferay.portal.kernel.util.HttpUtil;
020    import com.liferay.portal.kernel.util.StringBundler;
021    import com.liferay.portal.kernel.util.StringPool;
022    import com.liferay.portal.kernel.util.StringUtil;
023    import com.liferay.portal.kernel.util.Validator;
024    
025    import java.util.regex.Matcher;
026    import java.util.regex.Pattern;
027    
028    import net.htmlparser.jericho.Source;
029    import net.htmlparser.jericho.TextExtractor;
030    
031    /**
032     * @author Brian Wing Shun Chan
033     * @author Clarence Shen
034     * @author Harry Mark
035     * @author Samuel Kong
036     * @author Connor McKay
037     * @author Shuyang Zhou
038     */
039    public class HtmlImpl implements Html {
040    
041            public static final int ESCAPE_MODE_ATTRIBUTE = 1;
042    
043            public static final int ESCAPE_MODE_CSS = 2;
044    
045            public static final int ESCAPE_MODE_JS = 3;
046    
047            public static final int ESCAPE_MODE_TEXT = 4;
048    
049            public static final int ESCAPE_MODE_URL = 5;
050    
051            public String escape(String text) {
052                    if (text == null) {
053                            return null;
054                    }
055    
056                    if (text.length() == 0) {
057                            return StringPool.BLANK;
058                    }
059    
060                    // Escape using XSS recommendations from
061                    // http://www.owasp.org/index.php/Cross_Site_Scripting
062                    // #How_to_Protect_Yourself
063    
064                    StringBundler sb = null;
065    
066                    int lastReplacementIndex = 0;
067    
068                    for (int i = 0; i < text.length(); i++) {
069                            char c = text.charAt(i);
070    
071                            String replacement = null;
072    
073                            switch (c) {
074                                    case '<':
075                                            replacement = "&lt;";
076    
077                                            break;
078    
079                                    case '>':
080                                            replacement = "&gt;";
081    
082                                            break;
083    
084                                    case '&':
085                                            replacement = "&amp;";
086    
087                                            break;
088    
089                                    case '"':
090                                            replacement = "&#034;";
091    
092                                            break;
093    
094                                    case '\'':
095                                            replacement = "&#039;";
096    
097                                            break;
098    
099                                    case '\u00bb': // '�'
100                                            replacement = "&#187;";
101    
102                                            break;
103    
104                                    case '\u2013':
105                                            replacement = "&#x2013;";
106    
107                                            break;
108    
109                                    case '\u2014':
110                                            replacement = "&#x2014;";
111    
112                                            break;
113                            }
114    
115                            if (replacement != null) {
116                                    if (sb == null) {
117                                            sb = new StringBundler();
118                                    }
119    
120                                    if (i > lastReplacementIndex) {
121                                            sb.append(text.substring(lastReplacementIndex, i));
122                                    }
123    
124                                    sb.append(replacement);
125    
126                                    lastReplacementIndex = i + 1;
127                            }
128                    }
129    
130                    if (sb == null) {
131                            return text;
132                    }
133                    else {
134                            if (lastReplacementIndex < text.length()) {
135                                    sb.append(text.substring(lastReplacementIndex));
136                            }
137    
138                            return sb.toString();
139                    }
140            }
141    
142            public String escape(String text, int type) {
143                    if (text == null) {
144                            return null;
145                    }
146    
147                    if (text.length() == 0) {
148                            return StringPool.BLANK;
149                    }
150    
151                    String prefix = StringPool.BLANK;
152                    String postfix = StringPool.BLANK;
153    
154                    if (type == ESCAPE_MODE_ATTRIBUTE) {
155                            prefix = "&#x";
156                            postfix = StringPool.SEMICOLON;
157                    }
158                    else if (type == ESCAPE_MODE_CSS) {
159                            prefix = StringPool.BACK_SLASH;
160                    }
161                    else if (type == ESCAPE_MODE_JS) {
162                            prefix = "\\x";
163                    }
164                    else if (type == ESCAPE_MODE_URL) {
165                            return HttpUtil.encodeURL(text, true);
166                    }
167                    else {
168                            return escape(text);
169                    }
170    
171                    StringBuilder sb = new StringBuilder();
172    
173                    for (int i = 0; i < text.length(); i++) {
174                            char c = text.charAt(i);
175    
176                            if (Character.isLetterOrDigit(c) ||
177                                    (c == CharPool.DASH) || (c == CharPool.UNDERLINE)) {
178    
179                                    sb.append(c);
180                            }
181                            else {
182                                    sb.append(prefix);
183    
184                                    String hexString = StringUtil.toHexString(c);
185    
186                                    if (hexString.length() == 1) {
187                                            sb.append(StringPool.ASCII_TABLE[48]);
188                                    }
189    
190                                    sb.append(hexString);
191                                    sb.append(postfix);
192                            }
193                    }
194    
195                    if (sb.length() == text.length()) {
196                            return text;
197                    }
198                    else {
199                            return sb.toString();
200                    }
201            }
202    
203            public String escapeAttribute(String attribute) {
204                    return escape(attribute, ESCAPE_MODE_ATTRIBUTE);
205            }
206    
207            public String escapeCSS(String css) {
208                    return escape(css, ESCAPE_MODE_CSS);
209            }
210    
211            public String escapeHREF(String href) {
212                    if (href == null) {
213                            return null;
214                    }
215    
216                    if (href.length() == 0) {
217                            return StringPool.BLANK;
218                    }
219    
220                    if (href.indexOf(StringPool.COLON) == 10) {
221                            String protocol = href.substring(0, 10).toLowerCase();
222    
223                            if (protocol.equals("javascript")) {
224                                    return StringUtil.replaceFirst(href, StringPool.COLON, "%3a");
225                            }
226                    }
227    
228                    return href;
229            }
230    
231            public String escapeJS(String js) {
232                    return escape(js, ESCAPE_MODE_JS);
233            }
234    
235            public String escapeURL(String url) {
236                    return escape(url, ESCAPE_MODE_URL);
237            }
238    
239            public String escapeXPath(String xPath) {
240                    if (Validator.isNull(xPath)) {
241                            return xPath;
242                    }
243    
244                    StringBuilder sb = new StringBuilder(xPath.length());
245    
246                    for (int i = 0; i < xPath.length(); i++) {
247                            char c = xPath.charAt(i);
248    
249                            boolean hasToken = false;
250    
251                            for (int j = 0; j < _XPATH_TOKENS.length; j++) {
252                                    if (c == _XPATH_TOKENS[j]) {
253                                            hasToken = true;
254    
255                                            break;
256                                    }
257                            }
258    
259                            if (hasToken) {
260                                    sb.append(StringPool.UNDERLINE);
261                            }
262                            else {
263                                    sb.append(c);
264                            }
265                    }
266    
267                    return sb.toString();
268            }
269    
270            public String escapeXPathAttribute(String xPathAttribute) {
271                    boolean hasApostrophe = xPathAttribute.contains(StringPool.APOSTROPHE);
272                    boolean hasQuote = xPathAttribute.contains(StringPool.QUOTE);
273    
274                    if (hasQuote && hasApostrophe) {
275                            String[] parts = xPathAttribute.split(StringPool.APOSTROPHE);
276    
277                            return "concat('".concat(
278                                    StringUtil.merge(parts, "', \"'\", '")).concat("')");
279                    }
280    
281                    if (hasQuote) {
282                            return StringPool.APOSTROPHE.concat(xPathAttribute).concat(
283                                    StringPool.APOSTROPHE);
284                    }
285    
286                    return StringPool.QUOTE.concat(xPathAttribute).concat(StringPool.QUOTE);
287            }
288    
289            public String extractText(String html) {
290                    if (html == null) {
291                            return null;
292                    }
293    
294                    Source source = new Source(html);
295    
296                    TextExtractor textExtractor = source.getTextExtractor();
297    
298                    return textExtractor.toString();
299            }
300    
301            public String fromInputSafe(String text) {
302                    return StringUtil.replace(text, "&amp;", "&");
303            }
304    
305            public String replaceMsWordCharacters(String text) {
306                    return StringUtil.replace(text, _MS_WORD_UNICODE, _MS_WORD_HTML);
307            }
308    
309            public String stripBetween(String text, String tag) {
310                    return StringUtil.stripBetween(text, "<" + tag, "</" + tag + ">");
311            }
312    
313            public String stripComments(String text) {
314                    return StringUtil.stripBetween(text, "<!--", "-->");
315            }
316    
317            public String stripHtml(String text) {
318                    if (text == null) {
319                            return null;
320                    }
321    
322                    text = stripComments(text);
323    
324                    StringBuilder sb = new StringBuilder(text.length());
325    
326                    int x = 0;
327                    int y = text.indexOf("<");
328    
329                    while (y != -1) {
330                            sb.append(text.substring(x, y));
331                            sb.append(StringPool.SPACE);
332    
333                            // Look for text enclosed by <script></script>
334    
335                            boolean scriptFound = isScriptTag(text, y + 1);
336    
337                            if (scriptFound) {
338                                    int pos = y + _TAG_SCRIPT.length;
339    
340                                    // Find end of the tag
341    
342                                    pos = text.indexOf(">", pos);
343    
344                                    if (pos >= 0) {
345    
346                                            // Check if preceding character is / (i.e. is this instance
347                                            // of <script/>)
348    
349                                            if (text.charAt(pos-1) != '/') {
350    
351                                                    // Search for the ending </script> tag
352    
353                                                    for (;;) {
354                                                            pos = text.indexOf("</", pos);
355    
356                                                            if (pos >= 0) {
357                                                                    if (isScriptTag(text, pos + 2)) {
358                                                                            y = pos;
359    
360                                                                            break;
361                                                                    }
362                                                                    else {
363    
364                                                                            // Skip past "</"
365    
366                                                                            pos += 2;
367                                                                    }
368                                                            }
369                                                            else {
370                                                                    break;
371                                                            }
372                                                    }
373                                            }
374                                    }
375                            }
376    
377                            x = text.indexOf(">", y);
378    
379                            if (x == -1) {
380                                    break;
381                            }
382    
383                            x++;
384    
385                            if (x < y) {
386    
387                                    // <b>Hello</b
388    
389                                    break;
390                            }
391    
392                            y = text.indexOf("<", x);
393                    }
394    
395                    if (y == -1) {
396                            sb.append(text.substring(x));
397                    }
398    
399                    return sb.toString();
400            }
401    
402            public String toInputSafe(String text) {
403                    return StringUtil.replace(
404                            text,
405                            new String[] {"&", "\""},
406                            new String[] {"&amp;", "&quot;"});
407            }
408    
409            public String unescape(String text) {
410                    if (text == null) {
411                            return null;
412                    }
413    
414                    if (text.length() == 0) {
415                            return StringPool.BLANK;
416                    }
417    
418                    // Optimize this
419    
420                    text = StringUtil.replace(text, "&lt;", "<");
421                    text = StringUtil.replace(text, "&gt;", ">");
422                    text = StringUtil.replace(text, "&amp;", "&");
423                    text = StringUtil.replace(text, "&#034;", "\"");
424                    text = StringUtil.replace(text, "&#039;", "'");
425                    text = StringUtil.replace(text, "&#040;", "(");
426                    text = StringUtil.replace(text, "&#041;", ")");
427                    text = StringUtil.replace(text, "&#044;", ",");
428                    text = StringUtil.replace(text, "&#035;", "#");
429                    text = StringUtil.replace(text, "&#037;", "%");
430                    text = StringUtil.replace(text, "&#059;", ";");
431                    text = StringUtil.replace(text, "&#061;", "=");
432                    text = StringUtil.replace(text, "&#043;", "+");
433                    text = StringUtil.replace(text, "&#045;", "-");
434    
435                    return text;
436            }
437    
438            public String unescapeCDATA(String text) {
439                    if (text == null) {
440                            return null;
441                    }
442    
443                    if (text.length() == 0) {
444                            return StringPool.BLANK;
445                    }
446    
447                    text = StringUtil.replace(text, "&lt;![CDATA[", "<![CDATA[");
448                    text = StringUtil.replace(text, "]]&gt;", "]]>");
449    
450                    return text;
451            }
452    
453            public String wordBreak(String text, int columns) {
454                    StringBundler sb = new StringBundler();
455    
456                    int length = 0;
457                    int lastWrite = 0;
458                    int pos = 0;
459    
460                    Pattern pattern = Pattern.compile("([\\s<&]|$)");
461    
462                    Matcher matcher = pattern.matcher(text);
463    
464                    while (matcher.find()) {
465                            if (matcher.start() < pos) {
466                                    continue;
467                            }
468    
469                            while ((length + matcher.start() - pos) >= columns) {
470                                    pos += columns - length;
471    
472                                    sb.append(text.substring(lastWrite, pos));
473                                    sb.append("<wbr/>&shy;");
474    
475                                    length = 0;
476                                    lastWrite = pos;
477                            }
478    
479                            length += matcher.start() - pos;
480    
481                            String group = matcher.group();
482    
483                            if (group.equals(StringPool.AMPERSAND)) {
484                                    int x = text.indexOf(StringPool.SEMICOLON, matcher.start());
485    
486                                    if (x != -1) {
487                                            length++;
488                                            pos = x + 1;
489                                    }
490    
491                                    continue;
492                            }
493    
494                            if (group.equals(StringPool.LESS_THAN)) {
495                                    int x = text.indexOf(StringPool.GREATER_THAN, matcher.start());
496    
497                                    if (x != -1) {
498                                            pos = x + 1;
499                                    }
500    
501                                    continue;
502                            }
503    
504                            if (group.equals(StringPool.SPACE) ||
505                                    group.equals(StringPool.NEW_LINE)) {
506    
507                                    length = 0;
508                                    pos = matcher.start() + 1;
509                            }
510                    }
511    
512                    sb.append(text.substring(lastWrite));
513    
514                    return sb.toString();
515            }
516    
517            protected boolean isScriptTag(String text, int pos) {
518                    if ((pos + _TAG_SCRIPT.length + 1) <= text.length()) {
519                            char item;
520    
521                            for (int i = 0; i < _TAG_SCRIPT.length; i++) {
522                                    item = text.charAt(pos++);
523    
524                                    if (Character.toLowerCase(item) != _TAG_SCRIPT[i]) {
525                                            return false;
526                                    }
527                            }
528    
529                            item = text.charAt(pos);
530    
531                            // Check that char after "script" is not a letter (i.e. another tag)
532    
533                            return !Character.isLetter(item);
534                    }
535                    else {
536                            return false;
537                    }
538            }
539    
540            private static final String[] _MS_WORD_HTML = new String[] {
541                    "&reg;", StringPool.APOSTROPHE, StringPool.QUOTE, StringPool.QUOTE
542            };
543    
544            private static final String[] _MS_WORD_UNICODE = new String[] {
545                    "\u00ae", "\u2019", "\u201c", "\u201d"
546            };
547    
548            private static final char[] _TAG_SCRIPT = {'s', 'c', 'r', 'i', 'p', 't'};
549    
550            // See http://www.w3.org/TR/xpath20/#lexical-structure
551    
552            private static final char[] _XPATH_TOKENS = {
553                    '(', ')', '[', ']', '.', '@', ',', ':', '/', '|', '+', '-', '=', '!',
554                    '<', '>', '*', '$', '"', '"', ' ', 9, 10, 13, 133, 8232};
555    
556    }