001    /**
002     * Copyright (c) 2000-2013 Liferay, Inc. All rights reserved.
003     *
004     * The contents of this file are subject to the terms of the Liferay Enterprise
005     * Subscription License ("License"). You may not use this file except in
006     * compliance with the License. You can obtain a copy of the License by
007     * contacting Liferay, Inc. See the License for the specific language governing
008     * permissions and limitations under the License, including but not limited to
009     * distribution rights of the Software.
010     *
011     *
012     *
013     */
014    
015    package com.liferay.portal.util;
016    
017    import com.liferay.portal.kernel.security.pacl.DoPrivileged;
018    import com.liferay.portal.kernel.util.CharPool;
019    import com.liferay.portal.kernel.util.Html;
020    import com.liferay.portal.kernel.util.HttpUtil;
021    import com.liferay.portal.kernel.util.StringBundler;
022    import com.liferay.portal.kernel.util.StringPool;
023    import com.liferay.portal.kernel.util.StringUtil;
024    import com.liferay.portal.kernel.util.Validator;
025    
026    import java.util.regex.Matcher;
027    import java.util.regex.Pattern;
028    
029    import net.htmlparser.jericho.Renderer;
030    import net.htmlparser.jericho.Source;
031    import net.htmlparser.jericho.TextExtractor;
032    
033    /**
034     * @author Brian Wing Shun Chan
035     * @author Clarence Shen
036     * @author Harry Mark
037     * @author Samuel Kong
038     * @author Connor McKay
039     * @author Shuyang Zhou
040     */
041    @DoPrivileged
042    public class HtmlImpl implements Html {
043    
               /**
                * Escape mode for {@link #escape(String, int)} that writes each escaped
                * character as a hexadecimal HTML character reference
                * (<code>&amp;#x</code> prefix, <code>;</code> postfix).
                */
               public static final int ESCAPE_MODE_ATTRIBUTE = 1;

               /**
                * Escape mode for {@link #escape(String, int)} that writes each escaped
                * character as a backslash followed by its hexadecimal code.
                */
               public static final int ESCAPE_MODE_CSS = 2;

               /**
                * Escape mode for {@link #escape(String, int)} that writes each escaped
                * character as a backslash, the letter <code>x</code> and its hexadecimal
                * code.
                */
               public static final int ESCAPE_MODE_JS = 3;

               /**
                * Escape mode for {@link #escape(String, int)} that is not matched by any
                * dedicated branch and therefore falls back to {@link #escape(String)}.
                */
               public static final int ESCAPE_MODE_TEXT = 4;

               /**
                * Escape mode for {@link #escape(String, int)} that delegates to
                * <code>HttpUtil.encodeURL(text, true)</code>.
                */
               public static final int ESCAPE_MODE_URL = 5;
053    
054            /**
055             * Escapes the text so that it is safe to use in an HTML context.
056             *
057             * @param  text the text to escape
058             * @return the escaped HTML text, or <code>null</code> if the text is
059             *         <code>null</code>
060             */
061            @Override
062            public String escape(String text) {
063                    if (text == null) {
064                            return null;
065                    }
066    
067                    if (text.length() == 0) {
068                            return StringPool.BLANK;
069                    }
070    
071                    // Escape using XSS recommendations from
072                    // http://www.owasp.org/index.php/Cross_Site_Scripting
073                    // #How_to_Protect_Yourself
074    
075                    StringBundler sb = null;
076    
077                    int lastReplacementIndex = 0;
078    
079                    for (int i = 0; i < text.length(); i++) {
080                            char c = text.charAt(i);
081    
082                            String replacement = null;
083    
084                            switch (c) {
085                                    case '<':
086                                            replacement = "&lt;";
087    
088                                            break;
089    
090                                    case '>':
091                                            replacement = "&gt;";
092    
093                                            break;
094    
095                                    case '&':
096                                            replacement = "&amp;";
097    
098                                            break;
099    
100                                    case '"':
101                                            replacement = "&#034;";
102    
103                                            break;
104    
105                                    case '\'':
106                                            replacement = "&#039;";
107    
108                                            break;
109    
110                                    case '\u00bb': // '???'
111                                            replacement = "&#187;";
112    
113                                            break;
114    
115                                    case '\u2013':
116                                            replacement = "&#x2013;";
117    
118                                            break;
119    
120                                    case '\u2014':
121                                            replacement = "&#x2014;";
122    
123                                            break;
124    
125                                    case '\u2028':
126                                            replacement = "&#x8232;";
127    
128                                            break;
129                            }
130    
131                            if (replacement != null) {
132                                    if (sb == null) {
133                                            sb = new StringBundler();
134                                    }
135    
136                                    if (i > lastReplacementIndex) {
137                                            sb.append(text.substring(lastReplacementIndex, i));
138                                    }
139    
140                                    sb.append(replacement);
141    
142                                    lastReplacementIndex = i + 1;
143                            }
144                    }
145    
146                    if (sb == null) {
147                            return text;
148                    }
149    
150                    if (lastReplacementIndex < text.length()) {
151                            sb.append(text.substring(lastReplacementIndex));
152                    }
153    
154                    return sb.toString();
155            }
156    
157            @Override
158            public String escape(String text, int type) {
159                    if (text == null) {
160                            return null;
161                    }
162    
163                    if (text.length() == 0) {
164                            return StringPool.BLANK;
165                    }
166    
167                    String prefix = StringPool.BLANK;
168                    String postfix = StringPool.BLANK;
169    
170                    if (type == ESCAPE_MODE_ATTRIBUTE) {
171                            prefix = "&#x";
172                            postfix = StringPool.SEMICOLON;
173                    }
174                    else if (type == ESCAPE_MODE_CSS) {
175                            prefix = StringPool.BACK_SLASH;
176                    }
177                    else if (type == ESCAPE_MODE_JS) {
178                            prefix = "\\x";
179                    }
180                    else if (type == ESCAPE_MODE_URL) {
181                            return HttpUtil.encodeURL(text, true);
182                    }
183                    else {
184                            return escape(text);
185                    }
186    
187                    StringBuilder sb = new StringBuilder();
188    
189                    for (int i = 0; i < text.length(); i++) {
190                            char c = text.charAt(i);
191    
192                            if ((c > 255) || Character.isLetterOrDigit(c) ||
193                                    (c == CharPool.DASH) || (c == CharPool.UNDERLINE)) {
194    
195                                    sb.append(c);
196                            }
197                            else {
198                                    sb.append(prefix);
199    
200                                    String hexString = StringUtil.toHexString(c);
201    
202                                    if (hexString.length() == 1) {
203                                            sb.append(StringPool.ASCII_TABLE[48]);
204                                    }
205    
206                                    sb.append(hexString);
207                                    sb.append(postfix);
208                            }
209                    }
210    
211                    if (sb.length() == text.length()) {
212                            return text;
213                    }
214                    else {
215                            return sb.toString();
216                    }
217            }
218    
219            @Override
220            public String escapeAttribute(String attribute) {
221                    return escape(attribute, ESCAPE_MODE_ATTRIBUTE);
222            }
223    
224            @Override
225            public String escapeCSS(String css) {
226                    return escape(css, ESCAPE_MODE_CSS);
227            }
228    
229            @Override
230            public String escapeHREF(String href) {
231                    if (href == null) {
232                            return null;
233                    }
234    
235                    if (href.length() == 0) {
236                            return StringPool.BLANK;
237                    }
238    
239                    int index = href.indexOf(StringPool.COLON);
240    
241                    if (index == 4) {
242                            String protocol = StringUtil.toLowerCase(href.substring(0, 4));
243    
244                            if (protocol.equals("data")) {
245                                    href = StringUtil.replaceFirst(href, StringPool.COLON, "%3a");
246                            }
247                    }
248                    else if (index == 10) {
249                            String protocol = StringUtil.toLowerCase(href.substring(0, 10));
250    
251                            if (protocol.equals("javascript")) {
252                                    href = StringUtil.replaceFirst(href, StringPool.COLON, "%3a");
253                            }
254                    }
255    
256                    return escapeAttribute(href);
257            }
258    
259            @Override
260            public String escapeJS(String js) {
261                    return escape(js, ESCAPE_MODE_JS);
262            }
263    
264            @Override
265            public String escapeURL(String url) {
266                    return escape(url, ESCAPE_MODE_URL);
267            }
268    
269            @Override
270            public String escapeXPath(String xPath) {
271                    if (Validator.isNull(xPath)) {
272                            return xPath;
273                    }
274    
275                    StringBuilder sb = new StringBuilder(xPath.length());
276    
277                    for (int i = 0; i < xPath.length(); i++) {
278                            char c = xPath.charAt(i);
279    
280                            boolean hasToken = false;
281    
282                            for (int j = 0; j < _XPATH_TOKENS.length; j++) {
283                                    if (c == _XPATH_TOKENS[j]) {
284                                            hasToken = true;
285    
286                                            break;
287                                    }
288                            }
289    
290                            if (hasToken) {
291                                    sb.append(StringPool.UNDERLINE);
292                            }
293                            else {
294                                    sb.append(c);
295                            }
296                    }
297    
298                    return sb.toString();
299            }
300    
301            @Override
302            public String escapeXPathAttribute(String xPathAttribute) {
303                    boolean hasApostrophe = xPathAttribute.contains(StringPool.APOSTROPHE);
304                    boolean hasQuote = xPathAttribute.contains(StringPool.QUOTE);
305    
306                    if (hasQuote && hasApostrophe) {
307                            String[] parts = xPathAttribute.split(StringPool.APOSTROPHE);
308    
309                            return "concat('".concat(
310                                    StringUtil.merge(parts, "', \"'\", '")).concat("')");
311                    }
312    
313                    if (hasQuote) {
314                            return StringPool.APOSTROPHE.concat(xPathAttribute).concat(
315                                    StringPool.APOSTROPHE);
316                    }
317    
318                    return StringPool.QUOTE.concat(xPathAttribute).concat(StringPool.QUOTE);
319            }
320    
321            @Override
322            public String extractText(String html) {
323                    if (html == null) {
324                            return null;
325                    }
326    
327                    Source source = new Source(html);
328    
329                    TextExtractor textExtractor = source.getTextExtractor();
330    
331                    return textExtractor.toString();
332            }
333    
334            @Override
335            public String fromInputSafe(String text) {
336                    return StringUtil.replace(text, "&amp;", "&");
337            }
338    
339            @Override
340            public String getAUICompatibleId(String text) {
341                    if (Validator.isNull(text)) {
342                            return text;
343                    }
344    
345                    StringBundler sb = null;
346    
347                    int lastReplacementIndex = 0;
348    
349                    for (int i = 0; i < text.length(); i++) {
350                            char c = text.charAt(i);
351    
352                            if (((c <= 127) && (Validator.isChar(c) || Validator.isDigit(c))) ||
353                                    ((c > 127) && (c != CharPool.FIGURE_SPACE) &&
354                                     (c != CharPool.NARROW_NO_BREAK_SPACE) &&
355                                     (c != CharPool.NO_BREAK_SPACE))) {
356    
357                                    continue;
358                            }
359    
360                            if (sb == null) {
361                                    sb = new StringBundler();
362                            }
363    
364                            if (i > lastReplacementIndex) {
365                                    sb.append(text.substring(lastReplacementIndex, i));
366                            }
367    
368                            sb.append(CharPool.UNDERLINE);
369    
370                            if (c != CharPool.UNDERLINE) {
371                                    sb.append(StringUtil.toHexString(c));
372                            }
373    
374                            sb.append(CharPool.UNDERLINE);
375    
376                            lastReplacementIndex = i + 1;
377                    }
378    
379                    if (sb == null) {
380                            return text;
381                    }
382    
383                    if (lastReplacementIndex < text.length()) {
384                            sb.append(text.substring(lastReplacementIndex));
385                    }
386    
387                    return sb.toString();
388            }
389    
390            @Deprecated
391            @Override
392            public String render(String html) {
393                    if (html == null) {
394                            return null;
395                    }
396    
397                    Source source = new Source(html);
398    
399                    Renderer renderer = source.getRenderer();
400    
401                    return renderer.toString();
402            }
403    
404            @Override
405            public String replaceMsWordCharacters(String text) {
406                    return StringUtil.replace(text, _MS_WORD_UNICODE, _MS_WORD_HTML);
407            }
408    
409            @Override
410            public String replaceNewLine(String text) {
411                    if (text == null) {
412                            return null;
413                    }
414    
415                    return text.replaceAll("\r?\n", "<br />");
416            }
417    
418            @Override
419            public String stripBetween(String text, String tag) {
420                    return StringUtil.stripBetween(text, "<" + tag, "</" + tag + ">");
421            }
422    
423            @Override
424            public String stripComments(String text) {
425                    return StringUtil.stripBetween(text, "<!--", "-->");
426            }
427    
428            @Override
429            public String stripHtml(String text) {
430                    if (text == null) {
431                            return null;
432                    }
433    
434                    text = stripComments(text);
435    
436                    StringBuilder sb = new StringBuilder(text.length());
437    
438                    int x = 0;
439                    int y = text.indexOf("<");
440    
441                    while (y != -1) {
442                            sb.append(text.substring(x, y));
443                            sb.append(StringPool.SPACE);
444    
445                            // Look for text enclosed by <abc></abc>
446    
447                            if (isTag(_TAG_SCRIPT, text, y + 1)) {
448                                    y = stripTag(_TAG_SCRIPT, text, y);
449                            }
450                            else if (isTag(_TAG_STYLE, text, y + 1)) {
451                                    y = stripTag(_TAG_STYLE, text, y);
452                            }
453    
454                            x = text.indexOf(">", y);
455    
456                            if (x == -1) {
457                                    break;
458                            }
459    
460                            x++;
461    
462                            if (x < y) {
463    
464                                    // <b>Hello</b
465    
466                                    break;
467                            }
468    
469                            y = text.indexOf("<", x);
470                    }
471    
472                    if (y == -1) {
473                            sb.append(text.substring(x));
474                    }
475    
476                    return sb.toString();
477            }
478    
479            @Override
480            public String toInputSafe(String text) {
481                    return StringUtil.replace(
482                            text,
483                            new String[] {"&", "\""},
484                            new String[] {"&amp;", "&quot;"});
485            }
486    
487            @Override
488            public String unescape(String text) {
489                    if (text == null) {
490                            return null;
491                    }
492    
493                    if (text.length() == 0) {
494                            return StringPool.BLANK;
495                    }
496    
497                    // Optimize this
498    
499                    text = StringUtil.replace(text, "&lt;", "<");
500                    text = StringUtil.replace(text, "&gt;", ">");
501                    text = StringUtil.replace(text, "&amp;", "&");
502                    text = StringUtil.replace(text, "&rsquo;", "\u2019");
503                    text = StringUtil.replace(text, "&#034;", "\"");
504                    text = StringUtil.replace(text, "&#039;", "'");
505                    text = StringUtil.replace(text, "&#040;", "(");
506                    text = StringUtil.replace(text, "&#041;", ")");
507                    text = StringUtil.replace(text, "&#044;", ",");
508                    text = StringUtil.replace(text, "&#035;", "#");
509                    text = StringUtil.replace(text, "&#037;", "%");
510                    text = StringUtil.replace(text, "&#059;", ";");
511                    text = StringUtil.replace(text, "&#061;", "=");
512                    text = StringUtil.replace(text, "&#043;", "+");
513                    text = StringUtil.replace(text, "&#045;", "-");
514    
515                    return text;
516            }
517    
518            @Override
519            public String unescapeCDATA(String text) {
520                    if (text == null) {
521                            return null;
522                    }
523    
524                    if (text.length() == 0) {
525                            return StringPool.BLANK;
526                    }
527    
528                    text = StringUtil.replace(text, "&lt;![CDATA[", "<![CDATA[");
529                    text = StringUtil.replace(text, "]]&gt;", "]]>");
530    
531                    return text;
532            }
533    
534            @Override
535            public String wordBreak(String text, int columns) {
536                    StringBundler sb = new StringBundler();
537    
538                    int length = 0;
539                    int lastWrite = 0;
540                    int pos = 0;
541    
542                    Pattern pattern = Pattern.compile("([\\s<&]|$)");
543    
544                    Matcher matcher = pattern.matcher(text);
545    
546                    while (matcher.find()) {
547                            if (matcher.start() < pos) {
548                                    continue;
549                            }
550    
551                            while ((length + matcher.start() - pos) >= columns) {
552                                    pos += columns - length;
553    
554                                    sb.append(text.substring(lastWrite, pos));
555                                    sb.append("<wbr/>&shy;");
556    
557                                    length = 0;
558                                    lastWrite = pos;
559                            }
560    
561                            length += matcher.start() - pos;
562    
563                            String group = matcher.group();
564    
565                            if (group.equals(StringPool.AMPERSAND)) {
566                                    int x = text.indexOf(StringPool.SEMICOLON, matcher.start());
567    
568                                    if (x != -1) {
569                                            length++;
570                                            pos = x + 1;
571                                    }
572    
573                                    continue;
574                            }
575    
576                            if (group.equals(StringPool.LESS_THAN)) {
577                                    int x = text.indexOf(StringPool.GREATER_THAN, matcher.start());
578    
579                                    if (x != -1) {
580                                            pos = x + 1;
581                                    }
582    
583                                    continue;
584                            }
585    
586                            if (group.equals(StringPool.SPACE) ||
587                                    group.equals(StringPool.NEW_LINE)) {
588    
589                                    length = 0;
590                                    pos = matcher.start() + 1;
591                            }
592                    }
593    
594                    sb.append(text.substring(lastWrite));
595    
596                    return sb.toString();
597            }
598    
599            protected boolean isTag(char[] tag, String text, int pos) {
600                    if ((pos + tag.length + 1) <= text.length()) {
601                            char item;
602    
603                            for (int i = 0; i < tag.length; i++) {
604                                    item = text.charAt(pos++);
605    
606                                    if (Character.toLowerCase(item) != tag[i]) {
607                                            return false;
608                                    }
609                            }
610    
611                            item = text.charAt(pos);
612    
613                            // Check that char after tag is not a letter (i.e. another tag)
614    
615                            return !Character.isLetter(item);
616                    }
617                    else {
618                            return false;
619                    }
620            }
621    
622            protected int stripTag(char[] tag, String text, int pos) {
623                    int x = pos + _TAG_SCRIPT.length;
624    
625                    // Find end of the tag
626    
627                    x = text.indexOf(">", x);
628    
629                    if (x < 0) {
630                            return pos;
631                    }
632    
633                    // Check if preceding character is / (i.e. is this instance of <abc/>)
634    
635                    if (text.charAt(x-1) == '/') {
636                            return pos;
637                    }
638    
639                    // Search for the ending </abc> tag
640    
641                    while (true) {
642                            x = text.indexOf("</", x);
643    
644                            if (x >= 0) {
645                                    if (isTag(tag, text, x + 2)) {
646                                            pos = x;
647    
648                                            break;
649                                    }
650                                    else {
651    
652                                            // Skip past "</"
653    
654                                            x += 2;
655                                    }
656                            }
657                            else {
658                                    break;
659                            }
660                    }
661    
662                    return pos;
663            }
664    
665            private static final String[] _MS_WORD_HTML = new String[] {
666                    "&reg;", StringPool.APOSTROPHE, StringPool.QUOTE, StringPool.QUOTE
667            };
668    
669            private static final String[] _MS_WORD_UNICODE = new String[] {
670                    "\u00ae", "\u2019", "\u201c", "\u201d"
671            };
672    
673            private static final char[] _TAG_SCRIPT = {'s', 'c', 'r', 'i', 'p', 't'};
674    
675            private static final char[] _TAG_STYLE = {'s', 't', 'y', 'l', 'e'};
676    
677            // See http://www.w3.org/TR/xpath20/#lexical-structure
678    
679            private static final char[] _XPATH_TOKENS = {
680                    '(', ')', '[', ']', '.', '@', ',', ':', '/', '|', '+', '-', '=', '!',
681                    '<', '>', '*', '$', '"', '"', ' ', 9, 10, 13, 133, 8232};
682    
683    }