001    /**
002     * Copyright (c) 2000-present Liferay, Inc. All rights reserved.
003     *
004     * This library is free software; you can redistribute it and/or modify it under
005     * the terms of the GNU Lesser General Public License as published by the Free
006     * Software Foundation; either version 2.1 of the License, or (at your option)
007     * any later version.
008     *
009     * This library is distributed in the hope that it will be useful, but WITHOUT
010     * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
011     * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
012     * details.
013     */
014    
015    package com.liferay.portal.util;
016    
017    import com.liferay.portal.kernel.security.pacl.DoPrivileged;
018    import com.liferay.portal.kernel.util.CharPool;
019    import com.liferay.portal.kernel.util.Html;
020    import com.liferay.portal.kernel.util.HttpUtil;
021    import com.liferay.portal.kernel.util.StringBundler;
022    import com.liferay.portal.kernel.util.StringPool;
023    import com.liferay.portal.kernel.util.StringUtil;
024    import com.liferay.portal.kernel.util.Validator;
025    
026    import java.util.HashMap;
027    import java.util.Map;
028    import java.util.regex.Matcher;
029    import java.util.regex.Pattern;
030    
031    import net.htmlparser.jericho.Renderer;
032    import net.htmlparser.jericho.Source;
033    import net.htmlparser.jericho.TextExtractor;
034    
035    /**
036     * Provides the implementation of the HTML utility interface for escaping,
037     * rendering, replacing, and stripping HTML text. This class uses XSS
038     * recommendations from <a
039     * href="http://www.owasp.org/index.php/Cross_Site_Scripting#How_to_Protect_Yourself">http://www.owasp.org/index.php/Cross_Site_Scripting#How_to_Protect_Yourself</a>
040     * when escaping HTML text.
041     *
042     * @author Brian Wing Shun Chan
043     * @author Clarence Shen
044     * @author Harry Mark
045     * @author Samuel Kong
046     * @author Connor McKay
047     * @author Shuyang Zhou
048     */
049    @DoPrivileged
050    public class HtmlImpl implements Html {
051    
052            public static final int ESCAPE_MODE_ATTRIBUTE = 1;
053    
054            public static final int ESCAPE_MODE_CSS = 2;
055    
056            public static final int ESCAPE_MODE_JS = 3;
057    
058            public static final int ESCAPE_MODE_TEXT = 4;
059    
060            public static final int ESCAPE_MODE_URL = 5;
061    
062            /**
063             * Escapes the text so that it is safe to use in an HTML context.
064             *
065             * @param  text the text to escape
066             * @return the escaped HTML text, or <code>null</code> if the text is
067             *         <code>null</code>
068             */
069            @Override
070            public String escape(String text) {
071                    if (text == null) {
072                            return null;
073                    }
074    
075                    if (text.length() == 0) {
076                            return StringPool.BLANK;
077                    }
078    
079                    // Escape using XSS recommendations from
080                    // http://www.owasp.org/index.php/Cross_Site_Scripting
081                    // #How_to_Protect_Yourself
082    
083                    StringBundler sb = null;
084    
085                    int lastReplacementIndex = 0;
086    
087                    for (int i = 0; i < text.length(); i++) {
088                            char c = text.charAt(i);
089    
090                            String replacement = null;
091    
092                            switch (c) {
093                                    case '<':
094                                            replacement = "&lt;";
095    
096                                            break;
097    
098                                    case '>':
099                                            replacement = "&gt;";
100    
101                                            break;
102    
103                                    case '&':
104                                            replacement = "&amp;";
105    
106                                            break;
107    
108                                    case '"':
109                                            replacement = "&#034;";
110    
111                                            break;
112    
113                                    case '\'':
114                                            replacement = "&#039;";
115    
116                                            break;
117    
118                                    case '\u00bb': // '???'
119                                            replacement = "&#187;";
120    
121                                            break;
122    
123                                    case '\u2013':
124                                            replacement = "&#x2013;";
125    
126                                            break;
127    
128                                    case '\u2014':
129                                            replacement = "&#x2014;";
130    
131                                            break;
132                            }
133    
134                            if (replacement != null) {
135                                    if (sb == null) {
136                                            sb = new StringBundler();
137                                    }
138    
139                                    if (i > lastReplacementIndex) {
140                                            sb.append(text.substring(lastReplacementIndex, i));
141                                    }
142    
143                                    sb.append(replacement);
144    
145                                    lastReplacementIndex = i + 1;
146                            }
147                    }
148    
149                    if (sb == null) {
150                            return text;
151                    }
152    
153                    if (lastReplacementIndex < text.length()) {
154                            sb.append(text.substring(lastReplacementIndex));
155                    }
156    
157                    return sb.toString();
158            }
159    
160            /**
161             * Escapes the input text as a hexadecimal value, based on the mode (type).
162             * The encoding types include: {@link #ESCAPE_MODE_ATTRIBUTE}, {@link
163             * #ESCAPE_MODE_CSS}, {@link #ESCAPE_MODE_JS}, {@link #ESCAPE_MODE_TEXT},
164             * and {@link #ESCAPE_MODE_URL}.
165             *
166             * <p>
167             * Note that <code>escape(text, ESCAPE_MODE_TEXT)</code> returns the same as
168             * <code>escape(text)</code>.
169             * </p>
170             *
171             * @param  text the text to escape
172             * @param  mode the encoding type
173             * @return the escaped hexadecimal value of the input text, based on the
174             *         mode, or <code>null</code> if the text is <code>null</code>
175             */
176            @Override
177            public String escape(String text, int mode) {
178                    if (text == null) {
179                            return null;
180                    }
181    
182                    if (text.length() == 0) {
183                            return StringPool.BLANK;
184                    }
185    
186                    String prefix = StringPool.BLANK;
187                    String postfix = StringPool.BLANK;
188    
189                    if (mode == ESCAPE_MODE_ATTRIBUTE) {
190                            prefix = "&#x";
191                            postfix = StringPool.SEMICOLON;
192                    }
193                    else if (mode == ESCAPE_MODE_CSS) {
194                            prefix = StringPool.BACK_SLASH;
195                    }
196                    else if (mode == ESCAPE_MODE_JS) {
197                            prefix = "\\x";
198                    }
199                    else if (mode == ESCAPE_MODE_URL) {
200                            return HttpUtil.encodeURL(text, true);
201                    }
202                    else {
203                            return escape(text);
204                    }
205    
206                    StringBuilder sb = new StringBuilder();
207    
208                    for (int i = 0; i < text.length(); i++) {
209                            char c = text.charAt(i);
210    
211                            if ((c > 255) || Character.isLetterOrDigit(c) ||
212                                    (c == CharPool.DASH) || (c == CharPool.UNDERLINE)) {
213    
214                                    sb.append(c);
215                            }
216                            else {
217                                    sb.append(prefix);
218    
219                                    String hexString = StringUtil.toHexString(c);
220    
221                                    if (hexString.length() == 1) {
222                                            sb.append(StringPool.ASCII_TABLE[48]);
223                                    }
224    
225                                    sb.append(hexString);
226                                    sb.append(postfix);
227                            }
228                    }
229    
230                    if (sb.length() == text.length()) {
231                            return text;
232                    }
233                    else {
234                            return sb.toString();
235                    }
236            }
237    
238            /**
239             * Escapes the attribute value so that it is safe to use as an attribute
240             * value.
241             *
242             * @param  attribute the attribute to escape
243             * @return the escaped attribute value, or <code>null</code> if the
244             *         attribute value is <code>null</code>
245             */
246            @Override
247            public String escapeAttribute(String attribute) {
248                    return escape(attribute, ESCAPE_MODE_ATTRIBUTE);
249            }
250    
251            /**
252             * Escapes the CSS value so that it is safe to use in a CSS context.
253             *
254             * @param  css the CSS value to escape
255             * @return the escaped CSS value, or <code>null</code> if the CSS value is
256             *         <code>null</code>
257             */
258            @Override
259            public String escapeCSS(String css) {
260                    return escape(css, ESCAPE_MODE_CSS);
261            }
262    
263            /**
264             * Escapes the HREF attribute so that it is safe to use as an HREF
265             * attribute.
266             *
267             * @param  href the HREF attribute to escape
268             * @return the escaped HREF attribute, or <code>null</code> if the HREF
269             *         attribute is <code>null</code>
270             */
271            @Override
272            public String escapeHREF(String href) {
273                    if (href == null) {
274                            return null;
275                    }
276    
277                    if (href.length() == 0) {
278                            return StringPool.BLANK;
279                    }
280    
281                    if (href.indexOf(StringPool.COLON) == 10) {
282                            String protocol = StringUtil.toLowerCase(href.substring(0, 10));
283    
284                            if (protocol.equals("javascript")) {
285                                    href = StringUtil.replaceFirst(href, StringPool.COLON, "%3a");
286                            }
287                    }
288    
289                    return escapeAttribute(href);
290            }
291    
292            /**
293             * Escapes the JavaScript value so that it is safe to use in a JavaScript
294             * context.
295             *
296             * @param  js the JavaScript value to escape
297             * @return the escaped JavaScript value, or <code>null</code> if the
298             *         JavaScript value is <code>null</code>
299             */
300            @Override
301            public String escapeJS(String js) {
302                    return escape(js, ESCAPE_MODE_JS);
303            }
304    
305            @Override
306            public String escapeJSLink(String link) {
307                    if (Validator.isNull(link)) {
308                            return StringPool.BLANK;
309                    }
310    
311                    if (link.indexOf(StringPool.COLON) == 10) {
312                            String protocol = StringUtil.toLowerCase(link.substring(0, 10));
313    
314                            if (protocol.equals("javascript")) {
315                                    link = StringUtil.replaceFirst(link, StringPool.COLON, "%3a");
316                            }
317                    }
318    
319                    return link;
320            }
321    
322            /**
323             * Escapes the URL value so that it is safe to use as a URL.
324             *
325             * @param  url the URL value to escape
326             * @return the escaped URL value, or <code>null</code> if the URL value is
327             *         <code>null</code>
328             */
329            @Override
330            public String escapeURL(String url) {
331                    return escape(url, ESCAPE_MODE_URL);
332            }
333    
334            @Override
335            public String escapeXPath(String xPath) {
336                    if (Validator.isNull(xPath)) {
337                            return xPath;
338                    }
339    
340                    StringBuilder sb = new StringBuilder(xPath.length());
341    
342                    for (int i = 0; i < xPath.length(); i++) {
343                            char c = xPath.charAt(i);
344    
345                            boolean hasToken = false;
346    
347                            for (int j = 0; j < _XPATH_TOKENS.length; j++) {
348                                    if (c == _XPATH_TOKENS[j]) {
349                                            hasToken = true;
350    
351                                            break;
352                                    }
353                            }
354    
355                            if (hasToken) {
356                                    sb.append(StringPool.UNDERLINE);
357                            }
358                            else {
359                                    sb.append(c);
360                            }
361                    }
362    
363                    return sb.toString();
364            }
365    
366            @Override
367            public String escapeXPathAttribute(String xPathAttribute) {
368                    boolean hasApostrophe = xPathAttribute.contains(StringPool.APOSTROPHE);
369                    boolean hasQuote = xPathAttribute.contains(StringPool.QUOTE);
370    
371                    if (hasQuote && hasApostrophe) {
372                            String[] parts = xPathAttribute.split(StringPool.APOSTROPHE);
373    
374                            return "concat('".concat(
375                                    StringUtil.merge(parts, "', \"'\", '")).concat("')");
376                    }
377    
378                    if (hasQuote) {
379                            return StringPool.APOSTROPHE.concat(xPathAttribute).concat(
380                                    StringPool.APOSTROPHE);
381                    }
382    
383                    return StringPool.QUOTE.concat(xPathAttribute).concat(StringPool.QUOTE);
384            }
385    
386            /**
387             * Extracts the raw text from the HTML input, compressing its whitespace and
388             * removing all attributes, scripts, and styles.
389             *
390             * <p>
391             * For example, raw text returned by this method can be stored in a search
392             * index.
393             * </p>
394             *
395             * @param  html the HTML text
396             * @return the raw text from the HTML input, or <code>null</code> if the
397             *         HTML input is <code>null</code>
398             */
399            @Override
400            public String extractText(String html) {
401                    if (html == null) {
402                            return null;
403                    }
404    
405                    Source source = new Source(html);
406    
407                    TextExtractor textExtractor = source.getTextExtractor();
408    
409                    return textExtractor.toString();
410            }
411    
412            @Override
413            public String fromInputSafe(String text) {
414                    return StringUtil.replace(text, "&amp;", "&");
415            }
416    
417            @Override
418            public String getAUICompatibleId(String text) {
419                    if (Validator.isNull(text)) {
420                            return text;
421                    }
422    
423                    StringBundler sb = null;
424    
425                    int lastReplacementIndex = 0;
426    
427                    for (int i = 0; i < text.length(); i++) {
428                            char c = text.charAt(i);
429    
430                            if (((c <= 127) && (Validator.isChar(c) || Validator.isDigit(c))) ||
431                                    ((c > 127) && (c != CharPool.FIGURE_SPACE) &&
432                                     (c != CharPool.NARROW_NO_BREAK_SPACE) &&
433                                     (c != CharPool.NO_BREAK_SPACE))) {
434    
435                                    continue;
436                            }
437    
438                            if (sb == null) {
439                                    sb = new StringBundler();
440                            }
441    
442                            if (i > lastReplacementIndex) {
443                                    sb.append(text.substring(lastReplacementIndex, i));
444                            }
445    
446                            sb.append(CharPool.UNDERLINE);
447    
448                            if (c != CharPool.UNDERLINE) {
449                                    sb.append(StringUtil.toHexString(c));
450                            }
451    
452                            sb.append(CharPool.UNDERLINE);
453    
454                            lastReplacementIndex = i + 1;
455                    }
456    
457                    if (sb == null) {
458                            return text;
459                    }
460    
461                    if (lastReplacementIndex < text.length()) {
462                            sb.append(text.substring(lastReplacementIndex));
463                    }
464    
465                    return sb.toString();
466            }
467    
468            /**
469             * Renders the HTML content into text. This provides a human readable
470             * version of the content that is modeled on the way Mozilla
471             * Thunderbird&reg; and other email clients provide an automatic conversion
472             * of HTML content to text in their alternative MIME encoding of emails.
473             *
474             * <p>
475             * Using the default settings, the output complies with the
476             * <code>Text/Plain; Format=Flowed (DelSp=No)</code> protocol described in
477             * <a href="http://tools.ietf.org/html/rfc3676">RFC-3676</a>.
478             * </p>
479             *
480             * @param  html the HTML text
481             * @return the rendered HTML text, or <code>null</code> if the HTML text is
482             *         <code>null</code>
483             */
484            @Override
485            public String render(String html) {
486                    if (html == null) {
487                            return null;
488                    }
489    
490                    Source source = new Source(html);
491    
492                    Renderer renderer = source.getRenderer();
493    
494                    return renderer.toString();
495            }
496    
497            /**
498             * Replaces all Microsoft&reg; Word Unicode characters with plain HTML
499             * entities or characters.
500             *
501             * @param      text the text
502             * @return     the converted text, or <code>null</code> if the text is
503             *             <code>null</code>
504             * @deprecated As of 7.0.0, with no direct replacement
505             */
506            @Deprecated
507            @Override
508            public String replaceMsWordCharacters(String text) {
509                    return StringUtil.replace(text, _MS_WORD_UNICODE, _MS_WORD_HTML);
510            }
511    
512            /**
513             * Replaces all new lines or carriage returns with the <code><br /></code>
514             * HTML tag.
515             *
516             * @param  html the text
517             * @return the converted text, or <code>null</code> if the text is
518             *         <code>null</code>
519             */
520            @Override
521            public String replaceNewLine(String html) {
522                    if (html == null) {
523                            return null;
524                    }
525    
526                    html = StringUtil.replace(html, StringPool.RETURN_NEW_LINE, "<br />");
527    
528                    return StringUtil.replace(html, StringPool.NEW_LINE, "<br />");
529            }
530    
531            /**
532             * Strips all content delimited by the tag out of the text.
533             *
534             * <p>
535             * If the tag appears multiple times, all occurrences (including the tag)
536             * are stripped. The tag may have attributes. In order for this method to
537             * recognize the tag, it must consist of a separate opening and closing tag.
538             * Self-closing tags remain in the result.
539             * </p>
540             *
541             * @param  text the text
542             * @param  tag the tag used for delimiting, which should only be the tag's
543             *         name (e.g. no &lt;)
544             * @return the text, without the stripped tag and its contents, or
545             *         <code>null</code> if the text is <code>null</code>
546             */
547            @Override
548            public String stripBetween(String text, String tag) {
549                    return StringUtil.stripBetween(text, "<" + tag, "</" + tag + ">");
550            }
551    
552            /**
553             * Strips all XML comments out of the text.
554             *
555             * @param  text the text
556             * @return the text, without the stripped XML comments, or <code>null</code>
557             *         if the text is <code>null</code>
558             */
559            @Override
560            public String stripComments(String text) {
561                    return StringUtil.stripBetween(text, "<!--", "-->");
562            }
563    
564            @Override
565            public String stripHtml(String text) {
566                    if (text == null) {
567                            return null;
568                    }
569    
570                    text = stripComments(text);
571    
572                    StringBuilder sb = new StringBuilder(text.length());
573    
574                    int x = 0;
575                    int y = text.indexOf("<");
576    
577                    while (y != -1) {
578                            sb.append(text.substring(x, y));
579                            sb.append(StringPool.SPACE);
580    
581                            // Look for text enclosed by <abc></abc>
582    
583                            if (isTag(_TAG_SCRIPT, text, y + 1)) {
584                                    y = stripTag(_TAG_SCRIPT, text, y);
585                            }
586                            else if (isTag(_TAG_STYLE, text, y + 1)) {
587                                    y = stripTag(_TAG_STYLE, text, y);
588                            }
589    
590                            x = text.indexOf(">", y);
591    
592                            if (x == -1) {
593                                    break;
594                            }
595    
596                            x++;
597    
598                            if (x < y) {
599    
600                                    // <b>Hello</b
601    
602                                    break;
603                            }
604    
605                            y = text.indexOf("<", x);
606                    }
607    
608                    if (y == -1) {
609                            sb.append(text.substring(x));
610                    }
611    
612                    return sb.toString();
613            }
614    
615            /**
616             * Encodes the text so that it's safe to use as an HTML input field value.
617             *
618             * <p>
619             * For example, the <code>&</code> character is replaced by
620             * <code>&amp;amp;</code>.
621             * </p>
622             *
623             * @param  text the text
624             * @return the encoded text that is safe to use as an HTML input field
625             *         value, or <code>null</code> if the text is <code>null</code>
626             */
627            @Override
628            public String toInputSafe(String text) {
629                    return StringUtil.replace(
630                            text,
631                            new String[] {"&", "\""},
632                            new String[] {"&amp;", "&quot;"});
633            }
634    
635            @Override
636            public String unescape(String text) {
637                    return StringUtil.replace(text, "&", ";", _unescapeMap);
638            }
639    
640            @Override
641            public String unescapeCDATA(String text) {
642                    if (text == null) {
643                            return null;
644                    }
645    
646                    if (text.length() == 0) {
647                            return StringPool.BLANK;
648                    }
649    
650                    text = StringUtil.replace(text, "&lt;![CDATA[", "<![CDATA[");
651                    text = StringUtil.replace(text, "]]&gt;", "]]>");
652    
653                    return text;
654            }
655    
656            @Override
657            public String wordBreak(String text, int columns) {
658                    StringBundler sb = new StringBundler();
659    
660                    int length = 0;
661                    int lastWrite = 0;
662                    int pos = 0;
663    
664                    Matcher matcher = _pattern.matcher(text);
665    
666                    while (matcher.find()) {
667                            if (matcher.start() < pos) {
668                                    continue;
669                            }
670    
671                            while ((length + matcher.start() - pos) >= columns) {
672                                    pos += columns - length;
673    
674                                    sb.append(text.substring(lastWrite, pos));
675                                    sb.append("<wbr/>&shy;");
676    
677                                    length = 0;
678                                    lastWrite = pos;
679                            }
680    
681                            length += matcher.start() - pos;
682    
683                            String group = matcher.group();
684    
685                            if (group.equals(StringPool.AMPERSAND)) {
686                                    int x = text.indexOf(StringPool.SEMICOLON, matcher.start());
687    
688                                    if (x != -1) {
689                                            length++;
690                                            pos = x + 1;
691                                    }
692    
693                                    continue;
694                            }
695    
696                            if (group.equals(StringPool.LESS_THAN)) {
697                                    int x = text.indexOf(StringPool.GREATER_THAN, matcher.start());
698    
699                                    if (x != -1) {
700                                            pos = x + 1;
701                                    }
702    
703                                    continue;
704                            }
705    
706                            if (group.equals(StringPool.SPACE) ||
707                                    group.equals(StringPool.NEW_LINE)) {
708    
709                                    length = 0;
710                                    pos = matcher.start() + 1;
711                            }
712                    }
713    
714                    sb.append(text.substring(lastWrite));
715    
716                    return sb.toString();
717            }
718    
719            protected boolean isTag(char[] tag, String text, int pos) {
720                    if ((pos + tag.length + 1) <= text.length()) {
721                            char item;
722    
723                            for (int i = 0; i < tag.length; i++) {
724                                    item = text.charAt(pos++);
725    
726                                    if (Character.toLowerCase(item) != tag[i]) {
727                                            return false;
728                                    }
729                            }
730    
731                            item = text.charAt(pos);
732    
733                            // Check that char after tag is not a letter (i.e. another tag)
734    
735                            return !Character.isLetter(item);
736                    }
737                    else {
738                            return false;
739                    }
740            }
741    
742            protected int stripTag(char[] tag, String text, int pos) {
743                    int x = pos + _TAG_SCRIPT.length;
744    
745                    // Find end of the tag
746    
747                    x = text.indexOf(">", x);
748    
749                    if (x < 0) {
750                            return pos;
751                    }
752    
753                    // Check if preceding character is / (i.e. is this instance of <abc/>)
754    
755                    if (text.charAt(x-1) == '/') {
756                            return pos;
757                    }
758    
759                    // Search for the ending </abc> tag
760    
761                    while (true) {
762                            x = text.indexOf("</", x);
763    
764                            if (x >= 0) {
765                                    if (isTag(tag, text, x + 2)) {
766                                            pos = x;
767    
768                                            break;
769                                    }
770                                    else {
771    
772                                            // Skip past "</"
773    
774                                            x += 2;
775                                    }
776                            }
777                            else {
778                                    break;
779                            }
780                    }
781    
782                    return pos;
783            }
784    
785            private static final String[] _MS_WORD_HTML = new String[] {
786                    "&reg;", StringPool.APOSTROPHE, StringPool.QUOTE, StringPool.QUOTE
787            };
788    
789            private static final String[] _MS_WORD_UNICODE = new String[] {
790                    "\u00ae", "\u2019", "\u201c", "\u201d"
791            };
792    
793            private static final char[] _TAG_SCRIPT = {'s', 'c', 'r', 'i', 'p', 't'};
794    
795            private static final char[] _TAG_STYLE = {'s', 't', 'y', 'l', 'e'};
796    
797            // See http://www.w3.org/TR/xpath20/#lexical-structure
798    
799            private static final char[] _XPATH_TOKENS = {
800                    '(', ')', '[', ']', '.', '@', ',', ':', '/', '|', '+', '-', '=', '!',
801                    '<', '>', '*', '$', '"', '"', ' ', 9, 10, 13, 133, 8232
802            };
803    
804            private static final Map<String, String> _unescapeMap = new HashMap<>();
805    
806            static {
807                    _unescapeMap.put("lt", "<");
808                    _unescapeMap.put("gt", ">");
809                    _unescapeMap.put("amp", "&");
810                    _unescapeMap.put("rsquo", "\u2019");
811                    _unescapeMap.put("#034", "\"");
812                    _unescapeMap.put("#039", "'");
813                    _unescapeMap.put("#040", "(");
814                    _unescapeMap.put("#041", ")");
815                    _unescapeMap.put("#044", ",");
816                    _unescapeMap.put("#035", "#");
817                    _unescapeMap.put("#037", "%");
818                    _unescapeMap.put("#059", ";");
819                    _unescapeMap.put("#061", "=");
820                    _unescapeMap.put("#043", "+");
821                    _unescapeMap.put("#045", "-");
822            }
823    
824            private final Pattern _pattern = Pattern.compile("([\\s<&]|$)");
825    
826    }