001    /**
002     * Copyright (c) 2000-present Liferay, Inc. All rights reserved.
003     *
004     * This library is free software; you can redistribute it and/or modify it under
005     * the terms of the GNU Lesser General Public License as published by the Free
006     * Software Foundation; either version 2.1 of the License, or (at your option)
007     * any later version.
008     *
009     * This library is distributed in the hope that it will be useful, but WITHOUT
010     * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
011     * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
012     * details.
013     */
014    
015    package com.liferay.portal.util;
016    
017    import com.liferay.portal.kernel.security.pacl.DoPrivileged;
018    import com.liferay.portal.kernel.util.CharPool;
019    import com.liferay.portal.kernel.util.Html;
020    import com.liferay.portal.kernel.util.HttpUtil;
021    import com.liferay.portal.kernel.util.StringBundler;
022    import com.liferay.portal.kernel.util.StringPool;
023    import com.liferay.portal.kernel.util.StringUtil;
024    import com.liferay.portal.kernel.util.Validator;
025    
026    import java.util.HashMap;
027    import java.util.Map;
028    import java.util.regex.Matcher;
029    import java.util.regex.Pattern;
030    
031    import net.htmlparser.jericho.Renderer;
032    import net.htmlparser.jericho.Source;
033    import net.htmlparser.jericho.TextExtractor;
034    
035    /**
036     * Provides the implementation of the HTML utility interface for escaping,
037     * rendering, replacing, and stripping HTML text. This class uses XSS
038     * recommendations from <a
039     * href="http://www.owasp.org/index.php/Cross_Site_Scripting#How_to_Protect_Yourself">http://www.owasp.org/index.php/Cross_Site_Scripting#How_to_Protect_Yourself</a>
040     * when escaping HTML text.
041     *
042     * @author Brian Wing Shun Chan
043     * @author Clarence Shen
044     * @author Harry Mark
045     * @author Samuel Kong
046     * @author Connor McKay
047     * @author Shuyang Zhou
048     */
049    @DoPrivileged
050    public class HtmlImpl implements Html {
051    
        /**
         * Escape mode for HTML attribute values. In this mode {@link
         * #escape(String, int)} writes every escaped character as
         * <code>&amp;#x</code>, its hexadecimal code, and a semicolon.
         */
        public static final int ESCAPE_MODE_ATTRIBUTE = 1;

        /**
         * Escape mode for CSS values. In this mode {@link #escape(String, int)}
         * writes every escaped character as a backslash followed by its
         * hexadecimal code.
         */
        public static final int ESCAPE_MODE_CSS = 2;

        /**
         * Escape mode for JavaScript values. In this mode {@link
         * #escape(String, int)} writes every escaped character as <code>\x</code>
         * followed by its hexadecimal code.
         */
        public static final int ESCAPE_MODE_JS = 3;

        /**
         * Escape mode for HTML text. {@link #escape(String, int)} has no dedicated
         * branch for this value; like every mode it does not recognize, it falls
         * back to {@link #escape(String)}.
         */
        public static final int ESCAPE_MODE_TEXT = 4;

        /**
         * Escape mode for URLs. In this mode {@link #escape(String, int)} returns
         * the result of <code>HttpUtil.encodeURL(text, true)</code>.
         */
        public static final int ESCAPE_MODE_URL = 5;
061    
062            /**
063             * Generates a string with the data-* attributes generated from the keys and
064             * values of a map. For example, a map containing
065             * <code>{key1=value1;key2=value2}</code> is returned as the string
066             * <code>data-key1=value1 data-key2=value2</code>.
067             *
068             * @param  data the map of values to convert to data-* attributes
069             * @return a string with the data attributes, or <code>null</code> if the
070             *         map is <code>null</code>
071             */
072            @Override
073            public String buildData(Map<String, Object> data) {
074                    if ((data == null) || data.isEmpty()) {
075                            return StringPool.BLANK;
076                    }
077    
078                    StringBundler sb = new StringBundler(data.size() * 5);
079    
080                    for (Map.Entry<String, Object> entry : data.entrySet()) {
081                            sb.append("data-");
082                            sb.append(entry.getKey());
083                            sb.append("=\"");
084                            sb.append(escapeAttribute(String.valueOf(entry.getValue())));
085                            sb.append("\" ");
086                    }
087    
088                    return sb.toString();
089            }
090    
091            /**
092             * Escapes the text so that it is safe to use in an HTML context.
093             *
094             * @param  text the text to escape
095             * @return the escaped HTML text, or <code>null</code> if the text is
096             *         <code>null</code>
097             */
098            @Override
099            public String escape(String text) {
100                    if (text == null) {
101                            return null;
102                    }
103    
104                    if (text.length() == 0) {
105                            return StringPool.BLANK;
106                    }
107    
108                    // Escape using XSS recommendations from
109                    // http://www.owasp.org/index.php/Cross_Site_Scripting
110                    // #How_to_Protect_Yourself
111    
112                    return StringUtil.replace(
113                            text,
114                            new char[] {
115                                    '<', '>', '&', '"', '\'', '\u00bb', '\u2013', '\u2014', '\u2028'
116                            },
117                            new String[] {
118                                    "&lt;", "&gt;", "&amp;", "&#034;", "&#039;", "&#187;",
119                                    "&#x2013;", "&#x2014;", "&#x2028;"
120                            });
121            }
122    
123            /**
124             * Escapes the input text as a hexadecimal value, based on the mode (type).
125             * The encoding types include: {@link #ESCAPE_MODE_ATTRIBUTE}, {@link
126             * #ESCAPE_MODE_CSS}, {@link #ESCAPE_MODE_JS}, {@link #ESCAPE_MODE_TEXT},
127             * and {@link #ESCAPE_MODE_URL}.
128             *
129             * <p>
130             * Note that <code>escape(text, ESCAPE_MODE_TEXT)</code> returns the same as
131             * <code>escape(text)</code>.
132             * </p>
133             *
134             * @param  text the text to escape
135             * @param  mode the encoding type
136             * @return the escaped hexadecimal value of the input text, based on the
137             *         mode, or <code>null</code> if the text is <code>null</code>
138             */
139            @Override
140            public String escape(String text, int mode) {
141                    if (text == null) {
142                            return null;
143                    }
144    
145                    if (text.length() == 0) {
146                            return StringPool.BLANK;
147                    }
148    
149                    String prefix = StringPool.BLANK;
150                    String postfix = StringPool.BLANK;
151    
152                    if (mode == ESCAPE_MODE_ATTRIBUTE) {
153                            prefix = "&#x";
154                            postfix = StringPool.SEMICOLON;
155                    }
156                    else if (mode == ESCAPE_MODE_CSS) {
157                            prefix = StringPool.BACK_SLASH;
158                    }
159                    else if (mode == ESCAPE_MODE_JS) {
160                            prefix = "\\x";
161                    }
162                    else if (mode == ESCAPE_MODE_URL) {
163                            return HttpUtil.encodeURL(text, true);
164                    }
165                    else {
166                            return escape(text);
167                    }
168    
169                    StringBuilder sb = new StringBuilder(text.length());
170    
171                    for (int i = 0; i < text.length(); i++) {
172                            char c = text.charAt(i);
173    
174                            if ((c > 255) || (c == CharPool.DASH) ||
175                                    (c == CharPool.UNDERLINE) || Character.isLetterOrDigit(c)) {
176    
177                                    sb.append(c);
178                            }
179                            else {
180                                    sb.append(prefix);
181    
182                                    String hexString = StringUtil.toHexString(c);
183    
184                                    if (hexString.length() == 1) {
185                                            sb.append(StringPool.ASCII_TABLE[48]);
186                                    }
187    
188                                    sb.append(hexString);
189                                    sb.append(postfix);
190                            }
191                    }
192    
193                    if (sb.length() == text.length()) {
194                            return text;
195                    }
196    
197                    return sb.toString();
198            }
199    
        /**
         * Escapes the attribute value so that it is safe to use as an attribute
         * value.
         *
         * <p>
         * This is a shortcut for {@link #escape(String, int)} with {@link
         * #ESCAPE_MODE_ATTRIBUTE}.
         * </p>
         *
         * @param  attribute the attribute to escape
         * @return the escaped attribute value, or <code>null</code> if the
         *         attribute value is <code>null</code>
         */
        @Override
        public String escapeAttribute(String attribute) {
                return escape(attribute, ESCAPE_MODE_ATTRIBUTE);
        }
212    
        /**
         * Escapes the CSS value so that it is safe to use in a CSS context.
         *
         * <p>
         * This is a shortcut for {@link #escape(String, int)} with {@link
         * #ESCAPE_MODE_CSS}.
         * </p>
         *
         * @param  css the CSS value to escape
         * @return the escaped CSS value, or <code>null</code> if the CSS value is
         *         <code>null</code>
         */
        @Override
        public String escapeCSS(String css) {
                return escape(css, ESCAPE_MODE_CSS);
        }
224    
225            /**
226             * Escapes the HREF attribute so that it is safe to use as an HREF
227             * attribute.
228             *
229             * @param  href the HREF attribute to escape
230             * @return the escaped HREF attribute, or <code>null</code> if the HREF
231             *         attribute is <code>null</code>
232             */
233            @Override
234            public String escapeHREF(String href) {
235                    if (href == null) {
236                            return null;
237                    }
238    
239                    if (href.length() == 0) {
240                            return StringPool.BLANK;
241                    }
242    
243                    int index = href.indexOf(StringPool.COLON);
244    
245                    if (index == 4) {
246                            String protocol = StringUtil.toLowerCase(href.substring(0, 4));
247    
248                            if (protocol.equals("data")) {
249                                    href = StringUtil.replaceFirst(href, CharPool.COLON, "%3a");
250                            }
251                    }
252                    else if (index == 10) {
253                            String protocol = StringUtil.toLowerCase(href.substring(0, 10));
254    
255                            if (protocol.equals("javascript")) {
256                                    href = StringUtil.replaceFirst(href, CharPool.COLON, "%3a");
257                            }
258                    }
259    
260                    return escapeAttribute(href);
261            }
262    
        /**
         * Escapes the JavaScript value so that it is safe to use in a JavaScript
         * context.
         *
         * <p>
         * This is a shortcut for {@link #escape(String, int)} with {@link
         * #ESCAPE_MODE_JS}.
         * </p>
         *
         * @param  js the JavaScript value to escape
         * @return the escaped JavaScript value, or <code>null</code> if the
         *         JavaScript value is <code>null</code>
         */
        @Override
        public String escapeJS(String js) {
                return escape(js, ESCAPE_MODE_JS);
        }
275    
276            @Override
277            public String escapeJSLink(String link) {
278                    if (Validator.isNull(link)) {
279                            return StringPool.BLANK;
280                    }
281    
282                    if (link.indexOf(StringPool.COLON) == 10) {
283                            String protocol = StringUtil.toLowerCase(link.substring(0, 10));
284    
285                            if (protocol.equals("javascript")) {
286                                    link = StringUtil.replaceFirst(link, CharPool.COLON, "%3a");
287                            }
288                    }
289    
290                    return link;
291            }
292    
        /**
         * Escapes the URL value so that it is safe to use as a URL.
         *
         * <p>
         * This is a shortcut for {@link #escape(String, int)} with {@link
         * #ESCAPE_MODE_URL}, which delegates to
         * <code>HttpUtil.encodeURL(url, true)</code>.
         * </p>
         *
         * @param  url the URL value to escape
         * @return the escaped URL value, or <code>null</code> if the URL value is
         *         <code>null</code>
         */
        @Override
        public String escapeURL(String url) {
                return escape(url, ESCAPE_MODE_URL);
        }
304    
305            @Override
306            public String escapeXPath(String xPath) {
307                    if (Validator.isNull(xPath)) {
308                            return xPath;
309                    }
310    
311                    StringBuilder sb = new StringBuilder(xPath.length());
312    
313                    for (int i = 0; i < xPath.length(); i++) {
314                            char c = xPath.charAt(i);
315    
316                            boolean hasToken = false;
317    
318                            for (int j = 0; j < _XPATH_TOKENS.length; j++) {
319                                    if (c == _XPATH_TOKENS[j]) {
320                                            hasToken = true;
321    
322                                            break;
323                                    }
324                            }
325    
326                            if (hasToken) {
327                                    sb.append(StringPool.UNDERLINE);
328                            }
329                            else {
330                                    sb.append(c);
331                            }
332                    }
333    
334                    return sb.toString();
335            }
336    
337            @Override
338            public String escapeXPathAttribute(String xPathAttribute) {
339                    boolean hasApostrophe = xPathAttribute.contains(StringPool.APOSTROPHE);
340                    boolean hasQuote = xPathAttribute.contains(StringPool.QUOTE);
341    
342                    if (hasQuote && hasApostrophe) {
343                            String[] parts = xPathAttribute.split(StringPool.APOSTROPHE);
344    
345                            return "concat('".concat(
346                                    StringUtil.merge(parts, "', \"'\", '")).concat("')");
347                    }
348    
349                    if (hasQuote) {
350                            return StringPool.APOSTROPHE.concat(xPathAttribute).concat(
351                                    StringPool.APOSTROPHE);
352                    }
353    
354                    return StringPool.QUOTE.concat(xPathAttribute).concat(StringPool.QUOTE);
355            }
356    
357            /**
358             * Extracts the raw text from the HTML input, compressing its whitespace and
359             * removing all attributes, scripts, and styles.
360             *
361             * <p>
362             * For example, raw text returned by this method can be stored in a search
363             * index.
364             * </p>
365             *
366             * @param  html the HTML text
367             * @return the raw text from the HTML input, or <code>null</code> if the
368             *         HTML input is <code>null</code>
369             */
370            @Override
371            public String extractText(String html) {
372                    if (html == null) {
373                            return null;
374                    }
375    
376                    Source source = new Source(html);
377    
378                    TextExtractor textExtractor = source.getTextExtractor();
379    
380                    return textExtractor.toString();
381            }
382    
        /**
         * Replaces every <code>&amp;amp;</code> in the text with a plain
         * <code>&amp;</code>. Other character references are left untouched.
         *
         * @param  text the text
         * @return the result of <code>StringUtil.replace</code> for the text
         */
        @Override
        public String fromInputSafe(String text) {
                return StringUtil.replace(text, "&amp;", "&");
        }
387    
388            @Override
389            public String getAUICompatibleId(String text) {
390                    if (Validator.isNull(text)) {
391                            return text;
392                    }
393    
394                    StringBundler sb = null;
395    
396                    int lastReplacementIndex = 0;
397    
398                    for (int i = 0; i < text.length(); i++) {
399                            char c = text.charAt(i);
400    
401                            if (((c <= 127) && (Validator.isChar(c) || Validator.isDigit(c))) ||
402                                    ((c > 127) && (c != CharPool.FIGURE_SPACE) &&
403                                     (c != CharPool.NARROW_NO_BREAK_SPACE) &&
404                                     (c != CharPool.NO_BREAK_SPACE))) {
405    
406                                    continue;
407                            }
408    
409                            if (sb == null) {
410                                    sb = new StringBundler();
411                            }
412    
413                            if (i > lastReplacementIndex) {
414                                    sb.append(text.substring(lastReplacementIndex, i));
415                            }
416    
417                            sb.append(CharPool.UNDERLINE);
418    
419                            if (c != CharPool.UNDERLINE) {
420                                    sb.append(StringUtil.toHexString(c));
421                            }
422    
423                            sb.append(CharPool.UNDERLINE);
424    
425                            lastReplacementIndex = i + 1;
426                    }
427    
428                    if (sb == null) {
429                            return text;
430                    }
431    
432                    if (lastReplacementIndex < text.length()) {
433                            sb.append(text.substring(lastReplacementIndex));
434                    }
435    
436                    return sb.toString();
437            }
438    
439            /**
440             * Renders the HTML content into text. This provides a human readable
441             * version of the content that is modeled on the way Mozilla
442             * Thunderbird&reg; and other email clients provide an automatic conversion
443             * of HTML content to text in their alternative MIME encoding of emails.
444             *
445             * <p>
446             * Using the default settings, the output complies with the
447             * <code>Text/Plain; Format=Flowed (DelSp=No)</code> protocol described in
448             * <a href="http://tools.ietf.org/html/rfc3676">RFC-3676</a>.
449             * </p>
450             *
451             * @param  html the HTML text
452             * @return the rendered HTML text, or <code>null</code> if the HTML text is
453             *         <code>null</code>
454             */
455            @Override
456            public String render(String html) {
457                    if (html == null) {
458                            return null;
459                    }
460    
461                    Source source = new Source(html);
462    
463                    Renderer renderer = source.getRenderer();
464    
465                    return renderer.toString();
466            }
467    
        /**
         * Replaces all Microsoft&reg; Word Unicode characters with plain HTML
         * entities or characters.
         *
         * <p>
         * The replaced characters are listed in <code>_MS_WORD_UNICODE</code> and
         * their replacements, at the same positions, in
         * <code>_MS_WORD_HTML</code>: the registered sign (U+00AE), the right
         * single quotation mark (U+2019), and the left and right double quotation
         * marks (U+201C and U+201D).
         * </p>
         *
         * @param      text the text
         * @return     the converted text, or <code>null</code> if the text is
         *             <code>null</code>
         * @deprecated As of 7.0.0, with no direct replacement
         */
        @Deprecated
        @Override
        public String replaceMsWordCharacters(String text) {
                return StringUtil.replace(text, _MS_WORD_UNICODE, _MS_WORD_HTML);
        }
482    
483            /**
484             * Replaces all new lines or carriage returns with the <code><br /></code>
485             * HTML tag.
486             *
487             * @param  html the text
488             * @return the converted text, or <code>null</code> if the text is
489             *         <code>null</code>
490             */
491            @Override
492            public String replaceNewLine(String html) {
493                    if (html == null) {
494                            return null;
495                    }
496    
497                    html = StringUtil.replace(html, StringPool.RETURN_NEW_LINE, "<br />");
498    
499                    return StringUtil.replace(html, CharPool.NEW_LINE, "<br />");
500            }
501    
        /**
         * Strips all content delimited by the tag out of the text.
         *
         * <p>
         * If the tag appears multiple times, all occurrences (including the tag)
         * are stripped. The tag may have attributes. In order for this method to
         * recognize the tag, it must consist of a separate opening and closing tag.
         * Self-closing tags remain in the result.
         * </p>
         *
         * <p>
         * The work is delegated to <code>StringUtil.stripBetween</code>, with
         * <code>&lt;tag</code> as the opening delimiter and
         * <code>&lt;/tag&gt;</code> as the closing delimiter.
         * </p>
         *
         * @param  text the text
         * @param  tag the tag used for delimiting, which should only be the tag's
         *         name (e.g. no &lt;)
         * @return the text, without the stripped tag and its contents, or
         *         <code>null</code> if the text is <code>null</code>
         */
        @Override
        public String stripBetween(String text, String tag) {

                // NOTE(review): the opening delimiter has no terminator, so a tag
                // whose name merely starts with the given name presumably matches
                // as well -- confirm against StringUtil.stripBetween

                return StringUtil.stripBetween(text, "<" + tag, "</" + tag + ">");
        }
522    
        /**
         * Strips all XML comments out of the text. The work is delegated to
         * <code>StringUtil.stripBetween</code>, with <code>&lt;!--</code> as the
         * opening delimiter and <code>--&gt;</code> as the closing delimiter.
         *
         * @param  text the text
         * @return the text, without the stripped XML comments, or <code>null</code>
         *         if the text is <code>null</code>
         */
        @Override
        public String stripComments(String text) {
                return StringUtil.stripBetween(text, "<!--", "-->");
        }
534    
535            @Override
536            public String stripHtml(String text) {
537                    if (text == null) {
538                            return null;
539                    }
540    
541                    text = stripComments(text);
542    
543                    StringBuilder sb = new StringBuilder(text.length());
544    
545                    int x = 0;
546                    int y = text.indexOf("<");
547    
548                    while (y != -1) {
549                            sb.append(text.substring(x, y));
550                            sb.append(StringPool.SPACE);
551    
552                            // Look for text enclosed by <abc></abc>
553    
554                            if (isTag(_TAG_SCRIPT, text, y + 1)) {
555                                    y = stripTag(_TAG_SCRIPT, text, y);
556                            }
557                            else if (isTag(_TAG_STYLE, text, y + 1)) {
558                                    y = stripTag(_TAG_STYLE, text, y);
559                            }
560    
561                            x = text.indexOf(">", y);
562    
563                            if (x == -1) {
564                                    break;
565                            }
566    
567                            x++;
568    
569                            if (x < y) {
570    
571                                    // <b>Hello</b
572    
573                                    break;
574                            }
575    
576                            y = text.indexOf("<", x);
577                    }
578    
579                    if (y == -1) {
580                            sb.append(text.substring(x));
581                    }
582    
583                    return sb.toString();
584            }
585    
        /**
         * Encodes the text so that it's safe to use as an HTML input field value.
         *
         * <p>
         * For example, the <code>&amp;</code> character is replaced by
         * <code>&amp;amp;</code>, and the double quote character is replaced by
         * <code>&amp;quot;</code>. No other character is changed.
         * </p>
         *
         * @param  text the text
         * @return the encoded text that is safe to use as an HTML input field
         *         value, or <code>null</code> if the text is <code>null</code>
         */
        @Override
        public String toInputSafe(String text) {
                return StringUtil.replace(
                        text, new char[] {'&', '\"'}, new String[] {"&amp;", "&quot;"});
        }
603    
        /**
         * Replaces the character references known to this class with the
         * characters they stand for.
         *
         * <p>
         * The work is delegated to <code>StringUtil.replace</code>, which is
         * given <code>&amp;</code> and <code>;</code> as delimiters and
         * <code>_unescapeMap</code> as the lookup from reference name to
         * replacement. Presumably references missing from the map are left
         * unchanged -- confirm against <code>StringUtil.replace</code>.
         * </p>
         *
         * @param  text the text to unescape
         * @return the result of <code>StringUtil.replace</code> for the text
         */
        @Override
        public String unescape(String text) {
                return StringUtil.replace(text, "&", ";", _unescapeMap);
        }
608    
609            @Override
610            public String unescapeCDATA(String text) {
611                    if (text == null) {
612                            return null;
613                    }
614    
615                    if (text.length() == 0) {
616                            return StringPool.BLANK;
617                    }
618    
619                    text = StringUtil.replace(text, "&lt;![CDATA[", "<![CDATA[");
620                    text = StringUtil.replace(text, "]]&gt;", "]]>");
621    
622                    return text;
623            }
624    
625            @Override
626            public String wordBreak(String text, int columns) {
627                    StringBundler sb = new StringBundler();
628    
629                    int length = 0;
630                    int lastWrite = 0;
631                    int pos = 0;
632    
633                    Matcher matcher = _pattern.matcher(text);
634    
635                    while (matcher.find()) {
636                            if (matcher.start() < pos) {
637                                    continue;
638                            }
639    
640                            while ((length + matcher.start() - pos) >= columns) {
641                                    pos += columns - length;
642    
643                                    sb.append(text.substring(lastWrite, pos));
644                                    sb.append("<wbr/>&shy;");
645    
646                                    length = 0;
647                                    lastWrite = pos;
648                            }
649    
650                            length += matcher.start() - pos;
651    
652                            String group = matcher.group();
653    
654                            if (group.equals(StringPool.AMPERSAND)) {
655                                    int x = text.indexOf(StringPool.SEMICOLON, matcher.start());
656    
657                                    if (x != -1) {
658                                            length++;
659                                            pos = x + 1;
660                                    }
661    
662                                    continue;
663                            }
664    
665                            if (group.equals(StringPool.LESS_THAN)) {
666                                    int x = text.indexOf(StringPool.GREATER_THAN, matcher.start());
667    
668                                    if (x != -1) {
669                                            pos = x + 1;
670                                    }
671    
672                                    continue;
673                            }
674    
675                            if (group.equals(StringPool.SPACE) ||
676                                    group.equals(StringPool.NEW_LINE)) {
677    
678                                    length = 0;
679                                    pos = matcher.start() + 1;
680                            }
681                    }
682    
683                    sb.append(text.substring(lastWrite));
684    
685                    return sb.toString();
686            }
687    
688            protected boolean isTag(char[] tag, String text, int pos) {
689                    if ((pos + tag.length + 1) <= text.length()) {
690                            char item = '\0';
691    
692                            for (int i = 0; i < tag.length; i++) {
693                                    item = text.charAt(pos++);
694    
695                                    if (Character.toLowerCase(item) != tag[i]) {
696                                            return false;
697                                    }
698                            }
699    
700                            item = text.charAt(pos);
701    
702                            // Check that char after tag is not a letter (i.e. another tag)
703    
704                            return !Character.isLetter(item);
705                    }
706                    else {
707                            return false;
708                    }
709            }
710    
711            protected int stripTag(char[] tag, String text, int pos) {
712                    int x = pos + _TAG_SCRIPT.length;
713    
714                    // Find end of the tag
715    
716                    x = text.indexOf(">", x);
717    
718                    if (x < 0) {
719                            return pos;
720                    }
721    
722                    // Check if preceding character is / (i.e. is this instance of <abc/>)
723    
724                    if (text.charAt(x-1) == '/') {
725                            return pos;
726                    }
727    
728                    // Search for the ending </abc> tag
729    
730                    while (true) {
731                            x = text.indexOf("</", x);
732    
733                            if (x >= 0) {
734                                    if (isTag(tag, text, x + 2)) {
735                                            pos = x;
736    
737                                            break;
738                                    }
739                                    else {
740    
741                                            // Skip past "</"
742    
743                                            x += 2;
744                                    }
745                            }
746                            else {
747                                    break;
748                            }
749                    }
750    
751                    return pos;
752            }
753    
        // Replacements used by replaceMsWordCharacters. Each entry stands for the
        // character at the same index of _MS_WORD_UNICODE.

        private static final String[] _MS_WORD_HTML = new String[] {
                "&reg;", StringPool.APOSTROPHE, StringPool.QUOTE, StringPool.QUOTE
        };

        // Characters replaced by replaceMsWordCharacters: the registered sign
        // (U+00AE), the right single quotation mark (U+2019), and the left and
        // right double quotation marks (U+201C and U+201D)

        private static final String[] _MS_WORD_UNICODE =
                new String[] {"\u00ae", "\u2019", "\u201c", "\u201d"};
760    
        // Lower case names of the elements that stripHtml removes together with
        // their content. isTag lower cases the text before comparing against them.

        private static final char[] _TAG_SCRIPT = {'s', 'c', 'r', 'i', 'p', 't'};

        private static final char[] _TAG_STYLE = {'s', 't', 'y', 'l', 'e'};
764    
765            // See http://www.w3.org/TR/xpath20/#lexical-structure
766    
767            private static final char[] _XPATH_TOKENS = {
768                    '(', ')', '[', ']', '.', '@', ',', ':', '/', '|', '+', '-', '=', '!',
769                    '<', '>', '*', '$', '"', '"', ' ', 9, 10, 13, 133, 8232
770            };
771    
        // Lookup used by unescape. The keys are character reference names
        // without the leading & and the trailing ; and the values are the
        // characters they are replaced with. The map is filled once in the static
        // initializer below and is not modified by any code in this class
        // afterwards.

        private static final Map<String, String> _unescapeMap = new HashMap<>();

        static {
                _unescapeMap.put("lt", "<");
                _unescapeMap.put("gt", ">");
                _unescapeMap.put("amp", "&");
                _unescapeMap.put("rsquo", "\u2019");
                _unescapeMap.put("#034", "\"");
                _unescapeMap.put("#039", "'");
                _unescapeMap.put("#040", "(");
                _unescapeMap.put("#041", ")");
                _unescapeMap.put("#044", ",");
                _unescapeMap.put("#035", "#");
                _unescapeMap.put("#037", "%");
                _unescapeMap.put("#059", ";");
                _unescapeMap.put("#061", "=");
                _unescapeMap.put("#043", "+");
                _unescapeMap.put("#045", "-");
        }
791    
792            private final Pattern _pattern = Pattern.compile("([\\s<&]|$)");
793    
794    }