001    /**
002     * Copyright (c) 2000-2013 Liferay, Inc. All rights reserved.
003     *
004     * The contents of this file are subject to the terms of the Liferay Enterprise
005     * Subscription License ("License"). You may not use this file except in
006     * compliance with the License. You can obtain a copy of the License by
007     * contacting Liferay, Inc. See the License for the specific language governing
008     * permissions and limitations under the License, including but not limited to
009     * distribution rights of the Software.
010     *
011     *
012     *
013     */
014    
015    package com.liferay.portal.util;
016    
017    import static java.lang.Character.MAX_SURROGATE;
018    import static java.lang.Character.MIN_SURROGATE;
019    
020    import com.liferay.portal.kernel.security.pacl.DoPrivileged;
021    import com.liferay.portal.kernel.util.CharPool;
022    import com.liferay.portal.kernel.util.Html;
023    import com.liferay.portal.kernel.util.HttpUtil;
024    import com.liferay.portal.kernel.util.StringBundler;
025    import com.liferay.portal.kernel.util.StringPool;
026    import com.liferay.portal.kernel.util.StringUtil;
027    import com.liferay.portal.kernel.util.Validator;
028    
029    import java.util.regex.Matcher;
030    import java.util.regex.Pattern;
031    
032    import net.htmlparser.jericho.Renderer;
033    import net.htmlparser.jericho.Source;
034    import net.htmlparser.jericho.TextExtractor;
035    
036    /**
037     * @author Brian Wing Shun Chan
038     * @author Clarence Shen
039     * @author Harry Mark
040     * @author Samuel Kong
041     * @author Connor McKay
042     * @author Shuyang Zhou
043     */
044    @DoPrivileged
045    public class HtmlImpl implements Html {
046    
047            public static final int ESCAPE_MODE_ATTRIBUTE = 1;
048    
049            public static final int ESCAPE_MODE_CSS = 2;
050    
051            public static final int ESCAPE_MODE_JS = 3;
052    
053            public static final int ESCAPE_MODE_TEXT = 4;
054    
055            public static final int ESCAPE_MODE_URL = 5;
056    
057            /**
058             * Escapes the text so that it is safe to use in an HTML context.
059             *
060             * @param  text the text to escape
061             * @return the escaped HTML text, or <code>null</code> if the text is
062             *         <code>null</code>
063             */
064            @Override
065            public String escape(String text) {
066                    if (text == null) {
067                            return null;
068                    }
069    
070                    if (text.length() == 0) {
071                            return StringPool.BLANK;
072                    }
073    
074                    // Escape using XSS recommendations from
075                    // http://www.owasp.org/index.php/Cross_Site_Scripting
076                    // #How_to_Protect_Yourself
077    
078                    StringBundler sb = null;
079    
080                    int lastReplacementIndex = 0;
081    
082                    for (int i = 0; i < text.length(); i++) {
083                            char c = text.charAt(i);
084    
085                            String replacement = null;
086    
087                            if (c == '<') {
088                                    replacement = "&lt;";
089                            }
090                            else if (c == '>') {
091                                    replacement = "&gt;";
092                            }
093                            else if (c == '&') {
094                                    replacement = "&amp;";
095                            }
096                            else if (c == '"') {
097                                    replacement = "&#034;";
098                            }
099                            else if (c == '\'') {
100                                    replacement = "&#039;";
101                            }
102                            else if (c == '\u00bb') {
103                                    replacement = "&#187;";
104                            }
105                            else if (c == '\u2013') {
106                                    replacement = "&#x2013;";
107                            }
108                            else if (c == '\u2014') {
109                                    replacement = "&#x2014;";
110                            }
111                            else if (c == '\u2028') {
112                                    replacement = "&#x8232;";
113                            }
114                            else if (!_isValidXmlCharacter(c) ||
115                                             _isUnicodeCompatibilityCharacter(c)) {
116    
117                                    replacement = StringPool.SPACE;
118                            }
119    
120                            if (replacement != null) {
121                                    if (sb == null) {
122                                            sb = new StringBundler();
123                                    }
124    
125                                    if (i > lastReplacementIndex) {
126                                            sb.append(text.substring(lastReplacementIndex, i));
127                                    }
128    
129                                    sb.append(replacement);
130    
131                                    lastReplacementIndex = i + 1;
132                            }
133                    }
134    
135                    if (sb == null) {
136                            return text;
137                    }
138    
139                    if (lastReplacementIndex < text.length()) {
140                            sb.append(text.substring(lastReplacementIndex));
141                    }
142    
143                    return sb.toString();
144            }
145    
146            @Override
147            public String escape(String text, int type) {
148                    if (text == null) {
149                            return null;
150                    }
151    
152                    if (text.length() == 0) {
153                            return StringPool.BLANK;
154                    }
155    
156                    String prefix = StringPool.BLANK;
157                    String postfix = StringPool.BLANK;
158    
159                    if (type == ESCAPE_MODE_ATTRIBUTE) {
160                            prefix = "&#x";
161                            postfix = StringPool.SEMICOLON;
162                    }
163                    else if (type == ESCAPE_MODE_CSS) {
164                            prefix = StringPool.BACK_SLASH;
165                    }
166                    else if (type == ESCAPE_MODE_JS) {
167                            prefix = "\\x";
168                    }
169                    else if (type == ESCAPE_MODE_URL) {
170                            return HttpUtil.encodeURL(text, true);
171                    }
172                    else {
173                            return escape(text);
174                    }
175    
176                    StringBuilder sb = null;
177                    char[] hexBuffer = new char[4];
178                    int lastReplacementIndex = 0;
179    
180                    for (int i = 0; i < text.length(); i++) {
181                            char c = text.charAt(i);
182    
183                            if (c < _VALID_CHARS.length) {
184                                    if (!_VALID_CHARS[c]) {
185                                            if (sb == null) {
186                                                    sb = new StringBuilder(text.length() + 64);
187                                            }
188    
189                                            if (i > lastReplacementIndex) {
190                                                    sb.append(text, lastReplacementIndex, i);
191                                            }
192    
193                                            sb.append(prefix);
194    
195                                            _appendHexChars(sb, hexBuffer, c);
196    
197                                            sb.append(postfix);
198    
199                                            lastReplacementIndex = i + 1;
200                                    }
201                            }
202                            else if ((type == ESCAPE_MODE_ATTRIBUTE) &&
203                                             (!_isValidXmlCharacter(c) ||
204                                              _isUnicodeCompatibilityCharacter(c))) {
205    
206                                    if (sb == null) {
207                                            sb = new StringBuilder(text.length() + 64);
208                                    }
209    
210                                    if (i > lastReplacementIndex) {
211                                            sb.append(text, lastReplacementIndex, i);
212                                    }
213    
214                                    sb.append(CharPool.SPACE);
215    
216                                    lastReplacementIndex = i + 1;
217                            }
218                    }
219    
220                    if (sb == null) {
221                            return text;
222                    }
223    
224                    if (lastReplacementIndex < text.length()) {
225                            sb.append(text, lastReplacementIndex, text.length());
226                    }
227    
228                    return sb.toString();
229            }
230    
231            @Override
232            public String escapeAttribute(String attribute) {
233                    return escape(attribute, ESCAPE_MODE_ATTRIBUTE);
234            }
235    
236            @Override
237            public String escapeCSS(String css) {
238                    return escape(css, ESCAPE_MODE_CSS);
239            }
240    
241            @Override
242            public String escapeHREF(String href) {
243                    if (href == null) {
244                            return null;
245                    }
246    
247                    if (href.length() == 0) {
248                            return StringPool.BLANK;
249                    }
250    
251                    int index = href.indexOf(StringPool.COLON);
252    
253                    if (index == 4) {
254                            String protocol = StringUtil.toLowerCase(href.substring(0, 4));
255    
256                            if (protocol.equals("data")) {
257                                    href = StringUtil.replaceFirst(href, StringPool.COLON, "%3a");
258                            }
259                    }
260                    else if (index == 10) {
261                            String protocol = StringUtil.toLowerCase(href.substring(0, 10));
262    
263                            if (protocol.equals("javascript")) {
264                                    href = StringUtil.replaceFirst(href, StringPool.COLON, "%3a");
265                            }
266                    }
267    
268                    return escapeAttribute(href);
269            }
270    
271            @Override
272            public String escapeJS(String js) {
273                    return escape(js, ESCAPE_MODE_JS);
274            }
275    
276            @Override
277            public String escapeURL(String url) {
278                    return escape(url, ESCAPE_MODE_URL);
279            }
280    
281            @Override
282            public String escapeXPath(String xPath) {
283                    if (Validator.isNull(xPath)) {
284                            return xPath;
285                    }
286    
287                    StringBuilder sb = new StringBuilder(xPath.length());
288    
289                    for (int i = 0; i < xPath.length(); i++) {
290                            char c = xPath.charAt(i);
291    
292                            boolean hasToken = false;
293    
294                            for (int j = 0; j < _XPATH_TOKENS.length; j++) {
295                                    if (c == _XPATH_TOKENS[j]) {
296                                            hasToken = true;
297    
298                                            break;
299                                    }
300                            }
301    
302                            if (hasToken) {
303                                    sb.append(StringPool.UNDERLINE);
304                            }
305                            else {
306                                    sb.append(c);
307                            }
308                    }
309    
310                    return sb.toString();
311            }
312    
313            @Override
314            public String escapeXPathAttribute(String xPathAttribute) {
315                    boolean hasApostrophe = xPathAttribute.contains(StringPool.APOSTROPHE);
316                    boolean hasQuote = xPathAttribute.contains(StringPool.QUOTE);
317    
318                    if (hasQuote && hasApostrophe) {
319                            String[] parts = xPathAttribute.split(StringPool.APOSTROPHE);
320    
321                            return "concat('".concat(
322                                    StringUtil.merge(parts, "', \"'\", '")).concat("')");
323                    }
324    
325                    if (hasQuote) {
326                            return StringPool.APOSTROPHE.concat(xPathAttribute).concat(
327                                    StringPool.APOSTROPHE);
328                    }
329    
330                    return StringPool.QUOTE.concat(xPathAttribute).concat(StringPool.QUOTE);
331            }
332    
333            @Override
334            public String extractText(String html) {
335                    if (html == null) {
336                            return null;
337                    }
338    
339                    Source source = new Source(html);
340    
341                    TextExtractor textExtractor = source.getTextExtractor();
342    
343                    return textExtractor.toString();
344            }
345    
346            @Override
347            public String fromInputSafe(String text) {
348                    return StringUtil.replace(text, "&amp;", "&");
349            }
350    
351            @Override
352            public String getAUICompatibleId(String text) {
353                    if (Validator.isNull(text)) {
354                            return text;
355                    }
356    
357                    StringBundler sb = null;
358    
359                    int lastReplacementIndex = 0;
360    
361                    for (int i = 0; i < text.length(); i++) {
362                            char c = text.charAt(i);
363    
364                            if (((c <= 127) && (Validator.isChar(c) || Validator.isDigit(c))) ||
365                                    ((c > 127) && (c != CharPool.FIGURE_SPACE) &&
366                                     (c != CharPool.NARROW_NO_BREAK_SPACE) &&
367                                     (c != CharPool.NO_BREAK_SPACE))) {
368    
369                                    continue;
370                            }
371    
372                            if (sb == null) {
373                                    sb = new StringBundler();
374                            }
375    
376                            if (i > lastReplacementIndex) {
377                                    sb.append(text.substring(lastReplacementIndex, i));
378                            }
379    
380                            sb.append(StringPool.UNDERLINE);
381    
382                            if (c != CharPool.UNDERLINE) {
383                                    sb.append(StringUtil.toHexString(c));
384                            }
385    
386                            sb.append(StringPool.UNDERLINE);
387    
388                            lastReplacementIndex = i + 1;
389                    }
390    
391                    if (sb == null) {
392                            return text;
393                    }
394    
395                    if (lastReplacementIndex < text.length()) {
396                            sb.append(text.substring(lastReplacementIndex));
397                    }
398    
399                    return sb.toString();
400            }
401    
402            @Deprecated
403            @Override
404            public String render(String html) {
405                    if (html == null) {
406                            return null;
407                    }
408    
409                    Source source = new Source(html);
410    
411                    Renderer renderer = source.getRenderer();
412    
413                    return renderer.toString();
414            }
415    
416            @Override
417            public String replaceMsWordCharacters(String text) {
418                    return StringUtil.replace(text, _MS_WORD_UNICODE, _MS_WORD_HTML);
419            }
420    
421            @Override
422            public String replaceNewLine(String text) {
423                    if (text == null) {
424                            return null;
425                    }
426    
427                    return text.replaceAll("\r?\n", "<br />");
428            }
429    
430            @Override
431            public String stripBetween(String text, String tag) {
432                    return StringUtil.stripBetween(text, "<" + tag, "</" + tag + ">");
433            }
434    
435            @Override
436            public String stripComments(String text) {
437                    return StringUtil.stripBetween(text, "<!--", "-->");
438            }
439    
440            @Override
441            public String stripHtml(String text) {
442                    if (text == null) {
443                            return null;
444                    }
445    
446                    text = stripComments(text);
447    
448                    StringBuilder sb = new StringBuilder(text.length());
449    
450                    int x = 0;
451                    int y = text.indexOf("<");
452    
453                    while (y != -1) {
454                            sb.append(text.substring(x, y));
455                            sb.append(StringPool.SPACE);
456    
457                            // Look for text enclosed by <abc></abc>
458    
459                            if (isTag(_TAG_SCRIPT, text, y + 1)) {
460                                    y = stripTag(_TAG_SCRIPT, text, y);
461                            }
462                            else if (isTag(_TAG_STYLE, text, y + 1)) {
463                                    y = stripTag(_TAG_STYLE, text, y);
464                            }
465    
466                            x = text.indexOf(">", y);
467    
468                            if (x == -1) {
469                                    break;
470                            }
471    
472                            x++;
473    
474                            if (x < y) {
475    
476                                    // <b>Hello</b
477    
478                                    break;
479                            }
480    
481                            y = text.indexOf("<", x);
482                    }
483    
484                    if (y == -1) {
485                            sb.append(text.substring(x));
486                    }
487    
488                    return sb.toString();
489            }
490    
491            @Override
492            public String toInputSafe(String text) {
493                    return StringUtil.replace(
494                            text,
495                            new String[] {"&", "\""},
496                            new String[] {"&amp;", "&quot;"});
497            }
498    
499            @Override
500            public String unescape(String text) {
501                    if (text == null) {
502                            return null;
503                    }
504    
505                    if (text.length() == 0) {
506                            return StringPool.BLANK;
507                    }
508    
509                    // Optimize this
510    
511                    text = StringUtil.replace(text, "&lt;", "<");
512                    text = StringUtil.replace(text, "&gt;", ">");
513                    text = StringUtil.replace(text, "&amp;", "&");
514                    text = StringUtil.replace(text, "&rsquo;", "\u2019");
515                    text = StringUtil.replace(text, "&#034;", "\"");
516                    text = StringUtil.replace(text, "&#039;", "'");
517                    text = StringUtil.replace(text, "&#040;", "(");
518                    text = StringUtil.replace(text, "&#041;", ")");
519                    text = StringUtil.replace(text, "&#044;", ",");
520                    text = StringUtil.replace(text, "&#035;", "#");
521                    text = StringUtil.replace(text, "&#037;", "%");
522                    text = StringUtil.replace(text, "&#059;", ";");
523                    text = StringUtil.replace(text, "&#061;", "=");
524                    text = StringUtil.replace(text, "&#043;", "+");
525                    text = StringUtil.replace(text, "&#045;", "-");
526    
527                    return text;
528            }
529    
530            @Override
531            public String unescapeCDATA(String text) {
532                    if (text == null) {
533                            return null;
534                    }
535    
536                    if (text.length() == 0) {
537                            return StringPool.BLANK;
538                    }
539    
540                    text = StringUtil.replace(text, "&lt;![CDATA[", "<![CDATA[");
541                    text = StringUtil.replace(text, "]]&gt;", "]]>");
542    
543                    return text;
544            }
545    
546            @Override
547            public String wordBreak(String text, int columns) {
548                    StringBundler sb = new StringBundler();
549    
550                    int length = 0;
551                    int lastWrite = 0;
552                    int pos = 0;
553    
554                    Pattern pattern = Pattern.compile("([\\s<&]|$)");
555    
556                    Matcher matcher = pattern.matcher(text);
557    
558                    while (matcher.find()) {
559                            if (matcher.start() < pos) {
560                                    continue;
561                            }
562    
563                            while ((length + matcher.start() - pos) >= columns) {
564                                    pos += columns - length;
565    
566                                    sb.append(text.substring(lastWrite, pos));
567                                    sb.append("<wbr/>&shy;");
568    
569                                    length = 0;
570                                    lastWrite = pos;
571                            }
572    
573                            length += matcher.start() - pos;
574    
575                            String group = matcher.group();
576    
577                            if (group.equals(StringPool.AMPERSAND)) {
578                                    int x = text.indexOf(StringPool.SEMICOLON, matcher.start());
579    
580                                    if (x != -1) {
581                                            length++;
582                                            pos = x + 1;
583                                    }
584    
585                                    continue;
586                            }
587    
588                            if (group.equals(StringPool.LESS_THAN)) {
589                                    int x = text.indexOf(StringPool.GREATER_THAN, matcher.start());
590    
591                                    if (x != -1) {
592                                            pos = x + 1;
593                                    }
594    
595                                    continue;
596                            }
597    
598                            if (group.equals(StringPool.SPACE) ||
599                                    group.equals(StringPool.NEW_LINE)) {
600    
601                                    length = 0;
602                                    pos = matcher.start() + 1;
603                            }
604                    }
605    
606                    sb.append(text.substring(lastWrite));
607    
608                    return sb.toString();
609            }
610    
611            protected boolean isTag(char[] tag, String text, int pos) {
612                    if ((pos + tag.length + 1) <= text.length()) {
613                            char item;
614    
615                            for (int i = 0; i < tag.length; i++) {
616                                    item = text.charAt(pos++);
617    
618                                    if (Character.toLowerCase(item) != tag[i]) {
619                                            return false;
620                                    }
621                            }
622    
623                            item = text.charAt(pos);
624    
625                            // Check that char after tag is not a letter (i.e. another tag)
626    
627                            return !Character.isLetter(item);
628                    }
629                    else {
630                            return false;
631                    }
632            }
633    
634            protected int stripTag(char[] tag, String text, int pos) {
635                    int x = pos + _TAG_SCRIPT.length;
636    
637                    // Find end of the tag
638    
639                    x = text.indexOf(">", x);
640    
641                    if (x < 0) {
642                            return pos;
643                    }
644    
645                    // Check if preceding character is / (i.e. is this instance of <abc/>)
646    
647                    if (text.charAt(x-1) == '/') {
648                            return pos;
649                    }
650    
651                    // Search for the ending </abc> tag
652    
653                    while (true) {
654                            x = text.indexOf("</", x);
655    
656                            if (x >= 0) {
657                                    if (isTag(tag, text, x + 2)) {
658                                            pos = x;
659    
660                                            break;
661                                    }
662                                    else {
663    
664                                            // Skip past "</"
665    
666                                            x += 2;
667                                    }
668                            }
669                            else {
670                                    break;
671                            }
672                    }
673    
674                    return pos;
675            }
676    
677            private static void _appendHexChars(
678                    StringBuilder sb, char[] buffer, char c) {
679    
680                    int index = buffer.length;
681    
682                    do {
683                            buffer[--index] = _HEX_DIGITS[c & 15];
684    
685                            c >>>= 4;
686                    }
687                    while (c != 0);
688    
689                    if (index == (buffer.length - 1)) {
690                            sb.append(CharPool.NUMBER_0);
691                            sb.append(buffer[index]);
692    
693                            return;
694                    }
695    
696                    sb.append(buffer, index, buffer.length - index);
697            }
698    
699            private boolean _isUnicodeCompatibilityCharacter(char c) {
700                    if (((c >= '\u007f') && (c <= '\u0084')) ||
701                            ((c >= '\u0086') && (c <= '\u009f')) ||
702                            ((c >= '\ufdd0') && (c <= '\ufdef'))) {
703    
704                            return true;
705                    }
706    
707                    return false;
708            }
709    
710            private boolean _isValidXmlCharacter(char c) {
711                    if (((c >= CharPool.SPACE) && (c <= '\ud7ff')) ||
712                            ((c >= '\ue000') && (c <= '\ufffd')) ||
713                            ((c >= MIN_SURROGATE) && (c < (MAX_SURROGATE + 1))) ||
714                            (c == CharPool.TAB) || (c == CharPool.NEW_LINE) ||
715                            (c == CharPool.RETURN)) {
716    
717                            return true;
718                    }
719    
720                    return false;
721            }
722    
723            private static final char[] _HEX_DIGITS = {
724                    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd',
725                    'e', 'f'
726            };
727    
728            private static final String[] _MS_WORD_HTML = new String[] {
729                    "&reg;", StringPool.APOSTROPHE, StringPool.QUOTE, StringPool.QUOTE
730            };
731    
732            private static final String[] _MS_WORD_UNICODE = new String[] {
733                    "\u00ae", "\u2019", "\u201c", "\u201d"
734            };
735    
736            private static final char[] _TAG_SCRIPT = {'s', 'c', 'r', 'i', 'p', 't'};
737    
738            private static final char[] _TAG_STYLE = {'s', 't', 'y', 'l', 'e'};
739    
740            private static final boolean[] _VALID_CHARS = new boolean[128];
741    
742            // See http://www.w3.org/TR/xpath20/#lexical-structure
743    
744            private static final char[] _XPATH_TOKENS = {
745                    '(', ')', '[', ']', '.', '@', ',', ':', '/', '|', '+', '-', '=', '!',
746                    '<', '>', '*', '$', '"', '"', ' ', 9, 10, 13, 133, 8232};
747    
748            static {
749                    for (int i = 'a'; i <= 'z'; i++) {
750                            _VALID_CHARS[i] = true;
751                    }
752    
753                    for (int i = 'A'; i <= 'Z'; i++) {
754                            _VALID_CHARS[i] = true;
755                    }
756    
757                    for (int i = '0'; i <= '9'; i++) {
758                            _VALID_CHARS[i] = true;
759                    }
760    
761                    _VALID_CHARS['-'] = true;
762                    _VALID_CHARS['_'] = true;
763            }
764    
765    }