001    /**
002     * Copyright (c) 2000-present Liferay, Inc. All rights reserved.
003     *
004     * This library is free software; you can redistribute it and/or modify it under
005     * the terms of the GNU Lesser General Public License as published by the Free
006     * Software Foundation; either version 2.1 of the License, or (at your option)
007     * any later version.
008     *
009     * This library is distributed in the hope that it will be useful, but WITHOUT
010     * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
011     * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
012     * details.
013     */
014    
015    package com.liferay.portal.util;
016    
017    import com.liferay.portal.kernel.security.pacl.DoPrivileged;
018    import com.liferay.portal.kernel.util.CharPool;
019    import com.liferay.portal.kernel.util.Html;
020    import com.liferay.portal.kernel.util.HttpUtil;
021    import com.liferay.portal.kernel.util.StringBundler;
022    import com.liferay.portal.kernel.util.StringPool;
023    import com.liferay.portal.kernel.util.StringUtil;
024    import com.liferay.portal.kernel.util.Validator;
025    
026    import java.util.regex.Matcher;
027    import java.util.regex.Pattern;
028    
029    import net.htmlparser.jericho.Renderer;
030    import net.htmlparser.jericho.Source;
031    import net.htmlparser.jericho.TextExtractor;
032    
033    /**
034     * Provides the implementation of the HTML utility interface for escaping,
035     * rendering, replacing, and stripping HTML text. This class uses XSS
036     * recommendations from <a
037     * href="http://www.owasp.org/index.php/Cross_Site_Scripting#How_to_Protect_Yourself">http://www.owasp.org/index.php/Cross_Site_Scripting#How_to_Protect_Yourself</a>
038     * when escaping HTML text.
039     *
040     * @author Brian Wing Shun Chan
041     * @author Clarence Shen
042     * @author Harry Mark
043     * @author Samuel Kong
044     * @author Connor McKay
045     * @author Shuyang Zhou
046     */
047    @DoPrivileged
048    public class HtmlImpl implements Html {
049    
/**
 * Mode for {@link #escape(String, int)}: escaped characters are written as
 * <code>&amp;#x</code> + hex + <code>;</code>.
 */
public static final int ESCAPE_MODE_ATTRIBUTE = 1;

/**
 * Mode for {@link #escape(String, int)}: escaped characters are written as
 * a backslash followed by their hex value.
 */
public static final int ESCAPE_MODE_CSS = 2;

/**
 * Mode for {@link #escape(String, int)}: escaped characters are written as
 * <code>\x</code> followed by their hex value.
 */
public static final int ESCAPE_MODE_JS = 3;

/**
 * Mode for {@link #escape(String, int)}: the text is passed to {@link
 * #escape(String)}.
 */
public static final int ESCAPE_MODE_TEXT = 4;

/**
 * Mode for {@link #escape(String, int)}: the text is passed to
 * <code>HttpUtil.encodeURL(text, true)</code>.
 */
public static final int ESCAPE_MODE_URL = 5;
059    
060            /**
061             * Escapes the text so that it is safe to use in an HTML context.
062             *
063             * @param  text the text to escape
064             * @return the escaped HTML text, or <code>null</code> if the text is
065             *         <code>null</code>
066             */
067            @Override
068            public String escape(String text) {
069                    if (text == null) {
070                            return null;
071                    }
072    
073                    if (text.length() == 0) {
074                            return StringPool.BLANK;
075                    }
076    
077                    // Escape using XSS recommendations from
078                    // http://www.owasp.org/index.php/Cross_Site_Scripting
079                    // #How_to_Protect_Yourself
080    
081                    StringBundler sb = null;
082    
083                    int lastReplacementIndex = 0;
084    
085                    for (int i = 0; i < text.length(); i++) {
086                            char c = text.charAt(i);
087    
088                            String replacement = null;
089    
090                            switch (c) {
091                                    case '<':
092                                            replacement = "&lt;";
093    
094                                            break;
095    
096                                    case '>':
097                                            replacement = "&gt;";
098    
099                                            break;
100    
101                                    case '&':
102                                            replacement = "&amp;";
103    
104                                            break;
105    
106                                    case '"':
107                                            replacement = "&#034;";
108    
109                                            break;
110    
111                                    case '\'':
112                                            replacement = "&#039;";
113    
114                                            break;
115    
116                                    case '\u00bb': // '???'
117                                            replacement = "&#187;";
118    
119                                            break;
120    
121                                    case '\u2013':
122                                            replacement = "&#x2013;";
123    
124                                            break;
125    
126                                    case '\u2014':
127                                            replacement = "&#x2014;";
128    
129                                            break;
130                            }
131    
132                            if (replacement != null) {
133                                    if (sb == null) {
134                                            sb = new StringBundler();
135                                    }
136    
137                                    if (i > lastReplacementIndex) {
138                                            sb.append(text.substring(lastReplacementIndex, i));
139                                    }
140    
141                                    sb.append(replacement);
142    
143                                    lastReplacementIndex = i + 1;
144                            }
145                    }
146    
147                    if (sb == null) {
148                            return text;
149                    }
150    
151                    if (lastReplacementIndex < text.length()) {
152                            sb.append(text.substring(lastReplacementIndex));
153                    }
154    
155                    return sb.toString();
156            }
157    
158            /**
159             * Escapes the input text as a hexadecimal value, based on the mode (type).
160             * The encoding types include: {@link #ESCAPE_MODE_ATTRIBUTE}, {@link
161             * #ESCAPE_MODE_CSS}, {@link #ESCAPE_MODE_JS}, {@link #ESCAPE_MODE_TEXT},
162             * and {@link #ESCAPE_MODE_URL}.
163             *
164             * <p>
165             * Note that <code>escape(text, ESCAPE_MODE_TEXT)</code> returns the same as
166             * <code>escape(text)</code>.
167             * </p>
168             *
169             * @param  text the text to escape
170             * @param  mode the encoding type
171             * @return the escaped hexadecimal value of the input text, based on the
172             *         mode, or <code>null</code> if the text is <code>null</code>
173             */
174            @Override
175            public String escape(String text, int mode) {
176                    if (text == null) {
177                            return null;
178                    }
179    
180                    if (text.length() == 0) {
181                            return StringPool.BLANK;
182                    }
183    
184                    String prefix = StringPool.BLANK;
185                    String postfix = StringPool.BLANK;
186    
187                    if (mode == ESCAPE_MODE_ATTRIBUTE) {
188                            prefix = "&#x";
189                            postfix = StringPool.SEMICOLON;
190                    }
191                    else if (mode == ESCAPE_MODE_CSS) {
192                            prefix = StringPool.BACK_SLASH;
193                    }
194                    else if (mode == ESCAPE_MODE_JS) {
195                            prefix = "\\x";
196                    }
197                    else if (mode == ESCAPE_MODE_URL) {
198                            return HttpUtil.encodeURL(text, true);
199                    }
200                    else {
201                            return escape(text);
202                    }
203    
204                    StringBuilder sb = new StringBuilder();
205    
206                    for (int i = 0; i < text.length(); i++) {
207                            char c = text.charAt(i);
208    
209                            if ((c > 255) || Character.isLetterOrDigit(c) ||
210                                    (c == CharPool.DASH) || (c == CharPool.UNDERLINE)) {
211    
212                                    sb.append(c);
213                            }
214                            else {
215                                    sb.append(prefix);
216    
217                                    String hexString = StringUtil.toHexString(c);
218    
219                                    if (hexString.length() == 1) {
220                                            sb.append(StringPool.ASCII_TABLE[48]);
221                                    }
222    
223                                    sb.append(hexString);
224                                    sb.append(postfix);
225                            }
226                    }
227    
228                    if (sb.length() == text.length()) {
229                            return text;
230                    }
231                    else {
232                            return sb.toString();
233                    }
234            }
235    
236            /**
237             * Escapes the attribute value so that it is safe to use as an attribute
238             * value.
239             *
240             * @param  attribute the attribute to escape
241             * @return the escaped attribute value, or <code>null</code> if the
242             *         attribute value is <code>null</code>
243             */
244            @Override
245            public String escapeAttribute(String attribute) {
246                    return escape(attribute, ESCAPE_MODE_ATTRIBUTE);
247            }
248    
249            /**
250             * Escapes the CSS value so that it is safe to use in a CSS context.
251             *
252             * @param  css the CSS value to escape
253             * @return the escaped CSS value, or <code>null</code> if the CSS value is
254             *         <code>null</code>
255             */
256            @Override
257            public String escapeCSS(String css) {
258                    return escape(css, ESCAPE_MODE_CSS);
259            }
260    
261            /**
262             * Escapes the HREF attribute so that it is safe to use as an HREF
263             * attribute.
264             *
265             * @param  href the HREF attribute to escape
266             * @return the escaped HREF attribute, or <code>null</code> if the HREF
267             *         attribute is <code>null</code>
268             */
269            @Override
270            public String escapeHREF(String href) {
271                    if (href == null) {
272                            return null;
273                    }
274    
275                    if (href.length() == 0) {
276                            return StringPool.BLANK;
277                    }
278    
279                    if (href.indexOf(StringPool.COLON) == 10) {
280                            String protocol = StringUtil.toLowerCase(href.substring(0, 10));
281    
282                            if (protocol.equals("javascript")) {
283                                    href = StringUtil.replaceFirst(href, StringPool.COLON, "%3a");
284                            }
285                    }
286    
287                    return escapeAttribute(href);
288            }
289    
290            /**
291             * Escapes the JavaScript value so that it is safe to use in a JavaScript
292             * context.
293             *
294             * @param  js the JavaScript value to escape
295             * @return the escaped JavaScript value, or <code>null</code> if the
296             *         JavaScript value is <code>null</code>
297             */
298            @Override
299            public String escapeJS(String js) {
300                    return escape(js, ESCAPE_MODE_JS);
301            }
302    
303            /**
304             * Escapes the URL value so that it is safe to use as a URL.
305             *
306             * @param  url the URL value to escape
307             * @return the escaped URL value, or <code>null</code> if the URL value is
308             *         <code>null</code>
309             */
310            @Override
311            public String escapeURL(String url) {
312                    return escape(url, ESCAPE_MODE_URL);
313            }
314    
315            @Override
316            public String escapeXPath(String xPath) {
317                    if (Validator.isNull(xPath)) {
318                            return xPath;
319                    }
320    
321                    StringBuilder sb = new StringBuilder(xPath.length());
322    
323                    for (int i = 0; i < xPath.length(); i++) {
324                            char c = xPath.charAt(i);
325    
326                            boolean hasToken = false;
327    
328                            for (int j = 0; j < _XPATH_TOKENS.length; j++) {
329                                    if (c == _XPATH_TOKENS[j]) {
330                                            hasToken = true;
331    
332                                            break;
333                                    }
334                            }
335    
336                            if (hasToken) {
337                                    sb.append(StringPool.UNDERLINE);
338                            }
339                            else {
340                                    sb.append(c);
341                            }
342                    }
343    
344                    return sb.toString();
345            }
346    
347            @Override
348            public String escapeXPathAttribute(String xPathAttribute) {
349                    boolean hasApostrophe = xPathAttribute.contains(StringPool.APOSTROPHE);
350                    boolean hasQuote = xPathAttribute.contains(StringPool.QUOTE);
351    
352                    if (hasQuote && hasApostrophe) {
353                            String[] parts = xPathAttribute.split(StringPool.APOSTROPHE);
354    
355                            return "concat('".concat(
356                                    StringUtil.merge(parts, "', \"'\", '")).concat("')");
357                    }
358    
359                    if (hasQuote) {
360                            return StringPool.APOSTROPHE.concat(xPathAttribute).concat(
361                                    StringPool.APOSTROPHE);
362                    }
363    
364                    return StringPool.QUOTE.concat(xPathAttribute).concat(StringPool.QUOTE);
365            }
366    
367            /**
368             * Extracts the raw text from the HTML input, compressing its whitespace and
369             * removing all attributes, scripts, and styles.
370             *
371             * <p>
372             * For example, raw text returned by this method can be stored in a search
373             * index.
374             * </p>
375             *
376             * @param  html the HTML text
377             * @return the raw text from the HTML input, or <code>null</code> if the
378             *         HTML input is <code>null</code>
379             */
380            @Override
381            public String extractText(String html) {
382                    if (html == null) {
383                            return null;
384                    }
385    
386                    Source source = new Source(html);
387    
388                    TextExtractor textExtractor = source.getTextExtractor();
389    
390                    return textExtractor.toString();
391            }
392    
393            @Override
394            public String fromInputSafe(String text) {
395                    return StringUtil.replace(text, "&amp;", "&");
396            }
397    
398            @Override
399            public String getAUICompatibleId(String text) {
400                    if (Validator.isNull(text)) {
401                            return text;
402                    }
403    
404                    StringBundler sb = null;
405    
406                    int lastReplacementIndex = 0;
407    
408                    for (int i = 0; i < text.length(); i++) {
409                            char c = text.charAt(i);
410    
411                            if (((c <= 127) && (Validator.isChar(c) || Validator.isDigit(c))) ||
412                                    ((c > 127) && (c != CharPool.FIGURE_SPACE) &&
413                                     (c != CharPool.NARROW_NO_BREAK_SPACE) &&
414                                     (c != CharPool.NO_BREAK_SPACE))) {
415    
416                                    continue;
417                            }
418    
419                            if (sb == null) {
420                                    sb = new StringBundler();
421                            }
422    
423                            if (i > lastReplacementIndex) {
424                                    sb.append(text.substring(lastReplacementIndex, i));
425                            }
426    
427                            sb.append(CharPool.UNDERLINE);
428    
429                            if (c != CharPool.UNDERLINE) {
430                                    sb.append(StringUtil.toHexString(c));
431                            }
432    
433                            sb.append(CharPool.UNDERLINE);
434    
435                            lastReplacementIndex = i + 1;
436                    }
437    
438                    if (sb == null) {
439                            return text;
440                    }
441    
442                    if (lastReplacementIndex < text.length()) {
443                            sb.append(text.substring(lastReplacementIndex));
444                    }
445    
446                    return sb.toString();
447            }
448    
449            /**
450             * Renders the HTML content into text. This provides a human readable
451             * version of the content that is modeled on the way Mozilla
452             * Thunderbird&reg; and other email clients provide an automatic conversion
453             * of HTML content to text in their alternative MIME encoding of emails.
454             *
455             * <p>
456             * Using the default settings, the output complies with the
457             * <code>Text/Plain; Format=Flowed (DelSp=No)</code> protocol described in
458             * <a href="http://tools.ietf.org/html/rfc3676">RFC-3676</a>.
459             * </p>
460             *
461             * @param  html the HTML text
462             * @return the rendered HTML text, or <code>null</code> if the HTML text is
463             *         <code>null</code>
464             */
465            @Override
466            public String render(String html) {
467                    if (html == null) {
468                            return null;
469                    }
470    
471                    Source source = new Source(html);
472    
473                    Renderer renderer = source.getRenderer();
474    
475                    return renderer.toString();
476            }
477    
478            /**
479             * Replaces all Microsoft&reg; Word Unicode characters with plain HTML
480             * entities or characters.
481             *
482             * @param      text the text
483             * @return     the converted text, or <code>null</code> if the text is
484             *             <code>null</code>
485             * @deprecated As of 7.0.0, with no direct replacement
486             */
487            @Deprecated
488            @Override
489            public String replaceMsWordCharacters(String text) {
490                    return StringUtil.replace(text, _MS_WORD_UNICODE, _MS_WORD_HTML);
491            }
492    
493            /**
494             * Replaces all new lines or carriage returns with the <code><br /></code>
495             * HTML tag.
496             *
497             * @param  html the text
498             * @return the converted text, or <code>null</code> if the text is
499             *         <code>null</code>
500             */
501            @Override
502            public String replaceNewLine(String html) {
503                    if (html == null) {
504                            return null;
505                    }
506    
507                    html = StringUtil.replace(html, StringPool.RETURN_NEW_LINE, "<br />");
508    
509                    return StringUtil.replace(html, StringPool.NEW_LINE, "<br />");
510            }
511    
512            /**
513             * Strips all content delimited by the tag out of the text.
514             *
515             * <p>
516             * If the tag appears multiple times, all occurrences (including the tag)
517             * are stripped. The tag may have attributes. In order for this method to
518             * recognize the tag, it must consist of a separate opening and closing tag.
519             * Self-closing tags remain in the result.
520             * </p>
521             *
522             * @param  text the text
523             * @param  tag the tag used for delimiting, which should only be the tag's
524             *         name (e.g. no &lt;)
525             * @return the text, without the stripped tag and its contents, or
526             *         <code>null</code> if the text is <code>null</code>
527             */
528            @Override
529            public String stripBetween(String text, String tag) {
530                    return StringUtil.stripBetween(text, "<" + tag, "</" + tag + ">");
531            }
532    
533            /**
534             * Strips all XML comments out of the text.
535             *
536             * @param  text the text
537             * @return the text, without the stripped XML comments, or <code>null</code>
538             *         if the text is <code>null</code>
539             */
540            @Override
541            public String stripComments(String text) {
542                    return StringUtil.stripBetween(text, "<!--", "-->");
543            }
544    
545            @Override
546            public String stripHtml(String text) {
547                    if (text == null) {
548                            return null;
549                    }
550    
551                    text = stripComments(text);
552    
553                    StringBuilder sb = new StringBuilder(text.length());
554    
555                    int x = 0;
556                    int y = text.indexOf("<");
557    
558                    while (y != -1) {
559                            sb.append(text.substring(x, y));
560                            sb.append(StringPool.SPACE);
561    
562                            // Look for text enclosed by <abc></abc>
563    
564                            if (isTag(_TAG_SCRIPT, text, y + 1)) {
565                                    y = stripTag(_TAG_SCRIPT, text, y);
566                            }
567                            else if (isTag(_TAG_STYLE, text, y + 1)) {
568                                    y = stripTag(_TAG_STYLE, text, y);
569                            }
570    
571                            x = text.indexOf(">", y);
572    
573                            if (x == -1) {
574                                    break;
575                            }
576    
577                            x++;
578    
579                            if (x < y) {
580    
581                                    // <b>Hello</b
582    
583                                    break;
584                            }
585    
586                            y = text.indexOf("<", x);
587                    }
588    
589                    if (y == -1) {
590                            sb.append(text.substring(x));
591                    }
592    
593                    return sb.toString();
594            }
595    
596            /**
597             * Encodes the text so that it's safe to use as an HTML input field value.
598             *
599             * <p>
600             * For example, the <code>&</code> character is replaced by
601             * <code>&amp;amp;</code>.
602             * </p>
603             *
604             * @param  text the text
605             * @return the encoded text that is safe to use as an HTML input field
606             *         value, or <code>null</code> if the text is <code>null</code>
607             */
608            @Override
609            public String toInputSafe(String text) {
610                    return StringUtil.replace(
611                            text,
612                            new String[] {"&", "\""},
613                            new String[] {"&amp;", "&quot;"});
614            }
615    
616            @Override
617            public String unescape(String text) {
618                    if (text == null) {
619                            return null;
620                    }
621    
622                    if (text.length() == 0) {
623                            return StringPool.BLANK;
624                    }
625    
626                    // Optimize this
627    
628                    text = StringUtil.replace(text, "&lt;", "<");
629                    text = StringUtil.replace(text, "&gt;", ">");
630                    text = StringUtil.replace(text, "&amp;", "&");
631                    text = StringUtil.replace(text, "&#034;", "\"");
632                    text = StringUtil.replace(text, "&#039;", "'");
633                    text = StringUtil.replace(text, "&#040;", "(");
634                    text = StringUtil.replace(text, "&#041;", ")");
635                    text = StringUtil.replace(text, "&#044;", ",");
636                    text = StringUtil.replace(text, "&#035;", "#");
637                    text = StringUtil.replace(text, "&#037;", "%");
638                    text = StringUtil.replace(text, "&#059;", ";");
639                    text = StringUtil.replace(text, "&#061;", "=");
640                    text = StringUtil.replace(text, "&#043;", "+");
641                    text = StringUtil.replace(text, "&#045;", "-");
642    
643                    return text;
644            }
645    
646            @Override
647            public String unescapeCDATA(String text) {
648                    if (text == null) {
649                            return null;
650                    }
651    
652                    if (text.length() == 0) {
653                            return StringPool.BLANK;
654                    }
655    
656                    text = StringUtil.replace(text, "&lt;![CDATA[", "<![CDATA[");
657                    text = StringUtil.replace(text, "]]&gt;", "]]>");
658    
659                    return text;
660            }
661    
662            @Override
663            public String wordBreak(String text, int columns) {
664                    StringBundler sb = new StringBundler();
665    
666                    int length = 0;
667                    int lastWrite = 0;
668                    int pos = 0;
669    
670                    Matcher matcher = _pattern.matcher(text);
671    
672                    while (matcher.find()) {
673                            if (matcher.start() < pos) {
674                                    continue;
675                            }
676    
677                            while ((length + matcher.start() - pos) >= columns) {
678                                    pos += columns - length;
679    
680                                    sb.append(text.substring(lastWrite, pos));
681                                    sb.append("<wbr/>&shy;");
682    
683                                    length = 0;
684                                    lastWrite = pos;
685                            }
686    
687                            length += matcher.start() - pos;
688    
689                            String group = matcher.group();
690    
691                            if (group.equals(StringPool.AMPERSAND)) {
692                                    int x = text.indexOf(StringPool.SEMICOLON, matcher.start());
693    
694                                    if (x != -1) {
695                                            length++;
696                                            pos = x + 1;
697                                    }
698    
699                                    continue;
700                            }
701    
702                            if (group.equals(StringPool.LESS_THAN)) {
703                                    int x = text.indexOf(StringPool.GREATER_THAN, matcher.start());
704    
705                                    if (x != -1) {
706                                            pos = x + 1;
707                                    }
708    
709                                    continue;
710                            }
711    
712                            if (group.equals(StringPool.SPACE) ||
713                                    group.equals(StringPool.NEW_LINE)) {
714    
715                                    length = 0;
716                                    pos = matcher.start() + 1;
717                            }
718                    }
719    
720                    sb.append(text.substring(lastWrite));
721    
722                    return sb.toString();
723            }
724    
725            protected boolean isTag(char[] tag, String text, int pos) {
726                    if ((pos + tag.length + 1) <= text.length()) {
727                            char item;
728    
729                            for (int i = 0; i < tag.length; i++) {
730                                    item = text.charAt(pos++);
731    
732                                    if (Character.toLowerCase(item) != tag[i]) {
733                                            return false;
734                                    }
735                            }
736    
737                            item = text.charAt(pos);
738    
739                            // Check that char after tag is not a letter (i.e. another tag)
740    
741                            return !Character.isLetter(item);
742                    }
743                    else {
744                            return false;
745                    }
746            }
747    
748            protected int stripTag(char[] tag, String text, int pos) {
749                    int x = pos + _TAG_SCRIPT.length;
750    
751                    // Find end of the tag
752    
753                    x = text.indexOf(">", x);
754    
755                    if (x < 0) {
756                            return pos;
757                    }
758    
759                    // Check if preceding character is / (i.e. is this instance of <abc/>)
760    
761                    if (text.charAt(x-1) == '/') {
762                            return pos;
763                    }
764    
765                    // Search for the ending </abc> tag
766    
767                    while (true) {
768                            x = text.indexOf("</", x);
769    
770                            if (x >= 0) {
771                                    if (isTag(tag, text, x + 2)) {
772                                            pos = x;
773    
774                                            break;
775                                    }
776                                    else {
777    
778                                            // Skip past "</"
779    
780                                            x += 2;
781                                    }
782                            }
783                            else {
784                                    break;
785                            }
786                    }
787    
788                    return pos;
789            }
790    
791            private static final String[] _MS_WORD_HTML = new String[] {
792                    "&reg;", StringPool.APOSTROPHE, StringPool.QUOTE, StringPool.QUOTE
793            };
794    
795            private static final String[] _MS_WORD_UNICODE = new String[] {
796                    "\u00ae", "\u2019", "\u201c", "\u201d"
797            };
798    
799            private static final char[] _TAG_SCRIPT = {'s', 'c', 'r', 'i', 'p', 't'};
800    
801            private static final char[] _TAG_STYLE = {'s', 't', 'y', 'l', 'e'};
802    
803            // See http://www.w3.org/TR/xpath20/#lexical-structure
804    
805            private static final char[] _XPATH_TOKENS = {
806                    '(', ')', '[', ']', '.', '@', ',', ':', '/', '|', '+', '-', '=', '!',
807                    '<', '>', '*', '$', '"', '"', ' ', 9, 10, 13, 133, 8232};
808    
809            private Pattern _pattern = Pattern.compile("([\\s<&]|$)");
810    
811    }