001    /**
002     * Copyright (c) 2000-2013 Liferay, Inc. All rights reserved.
003     *
004     * This library is free software; you can redistribute it and/or modify it under
005     * the terms of the GNU Lesser General Public License as published by the Free
006     * Software Foundation; either version 2.1 of the License, or (at your option)
007     * any later version.
008     *
009     * This library is distributed in the hope that it will be useful, but WITHOUT
010     * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
011     * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
012     * details.
013     */
014    
015    package com.liferay.portal.util;
016    
017    import com.liferay.portal.kernel.security.pacl.DoPrivileged;
018    import com.liferay.portal.kernel.util.CharPool;
019    import com.liferay.portal.kernel.util.Html;
020    import com.liferay.portal.kernel.util.HttpUtil;
021    import com.liferay.portal.kernel.util.StringBundler;
022    import com.liferay.portal.kernel.util.StringPool;
023    import com.liferay.portal.kernel.util.StringUtil;
024    import com.liferay.portal.kernel.util.Validator;
025    
026    import java.util.regex.Matcher;
027    import java.util.regex.Pattern;
028    
029    import net.htmlparser.jericho.Renderer;
030    import net.htmlparser.jericho.Source;
031    import net.htmlparser.jericho.TextExtractor;
032    
033    /**
034     * @author Brian Wing Shun Chan
035     * @author Clarence Shen
036     * @author Harry Mark
037     * @author Samuel Kong
038     * @author Connor McKay
039     * @author Shuyang Zhou
040     */
041    @DoPrivileged
042    public class HtmlImpl implements Html {
043    
        /**
         * Mode for {@link #escape(String, int)}: escaped characters are written
         * as hexadecimal character references (<code>&amp;#x..;</code>).
         */
        public static final int ESCAPE_MODE_ATTRIBUTE = 1;

        /**
         * Mode for {@link #escape(String, int)}: escaped characters are written
         * as a backslash followed by their hexadecimal code.
         */
        public static final int ESCAPE_MODE_CSS = 2;

        /**
         * Mode for {@link #escape(String, int)}: escaped characters are written
         * as <code>\x</code> followed by their hexadecimal code.
         */
        public static final int ESCAPE_MODE_JS = 3;

        /**
         * Mode for {@link #escape(String, int)}: no branch tests for this value
         * explicitly, so it takes the fallback path that delegates to
         * {@link #escape(String)}.
         */
        public static final int ESCAPE_MODE_TEXT = 4;

        /**
         * Mode for {@link #escape(String, int)}: the text is passed to
         * <code>HttpUtil.encodeURL</code>.
         */
        public static final int ESCAPE_MODE_URL = 5;
053    
054            /**
055             * Escapes the text so that it is safe to use in an HTML context.
056             *
057             * @param  text the text to escape
058             * @return the escaped HTML text, or <code>null</code> if the text is
059             *         <code>null</code>
060             */
061            @Override
062            public String escape(String text) {
063                    if (text == null) {
064                            return null;
065                    }
066    
067                    if (text.length() == 0) {
068                            return StringPool.BLANK;
069                    }
070    
071                    // Escape using XSS recommendations from
072                    // http://www.owasp.org/index.php/Cross_Site_Scripting
073                    // #How_to_Protect_Yourself
074    
075                    StringBundler sb = null;
076    
077                    int lastReplacementIndex = 0;
078    
079                    for (int i = 0; i < text.length(); i++) {
080                            char c = text.charAt(i);
081    
082                            String replacement = null;
083    
084                            switch (c) {
085                                    case '<':
086                                            replacement = "&lt;";
087    
088                                            break;
089    
090                                    case '>':
091                                            replacement = "&gt;";
092    
093                                            break;
094    
095                                    case '&':
096                                            replacement = "&amp;";
097    
098                                            break;
099    
100                                    case '"':
101                                            replacement = "&#034;";
102    
103                                            break;
104    
105                                    case '\'':
106                                            replacement = "&#039;";
107    
108                                            break;
109    
110                                    case '\u00bb': // '???'
111                                            replacement = "&#187;";
112    
113                                            break;
114    
115                                    case '\u2013':
116                                            replacement = "&#x2013;";
117    
118                                            break;
119    
120                                    case '\u2014':
121                                            replacement = "&#x2014;";
122    
123                                            break;
124                            }
125    
126                            if (replacement != null) {
127                                    if (sb == null) {
128                                            sb = new StringBundler();
129                                    }
130    
131                                    if (i > lastReplacementIndex) {
132                                            sb.append(text.substring(lastReplacementIndex, i));
133                                    }
134    
135                                    sb.append(replacement);
136    
137                                    lastReplacementIndex = i + 1;
138                            }
139                    }
140    
141                    if (sb == null) {
142                            return text;
143                    }
144    
145                    if (lastReplacementIndex < text.length()) {
146                            sb.append(text.substring(lastReplacementIndex));
147                    }
148    
149                    return sb.toString();
150            }
151    
152            @Override
153            public String escape(String text, int type) {
154                    if (text == null) {
155                            return null;
156                    }
157    
158                    if (text.length() == 0) {
159                            return StringPool.BLANK;
160                    }
161    
162                    String prefix = StringPool.BLANK;
163                    String postfix = StringPool.BLANK;
164    
165                    if (type == ESCAPE_MODE_ATTRIBUTE) {
166                            prefix = "&#x";
167                            postfix = StringPool.SEMICOLON;
168                    }
169                    else if (type == ESCAPE_MODE_CSS) {
170                            prefix = StringPool.BACK_SLASH;
171                    }
172                    else if (type == ESCAPE_MODE_JS) {
173                            prefix = "\\x";
174                    }
175                    else if (type == ESCAPE_MODE_URL) {
176                            return HttpUtil.encodeURL(text, true);
177                    }
178                    else {
179                            return escape(text);
180                    }
181    
182                    StringBuilder sb = new StringBuilder();
183    
184                    for (int i = 0; i < text.length(); i++) {
185                            char c = text.charAt(i);
186    
187                            if ((c > 255) || Character.isLetterOrDigit(c) ||
188                                    (c == CharPool.DASH) || (c == CharPool.UNDERLINE)) {
189    
190                                    sb.append(c);
191                            }
192                            else {
193                                    sb.append(prefix);
194    
195                                    String hexString = StringUtil.toHexString(c);
196    
197                                    if (hexString.length() == 1) {
198                                            sb.append(StringPool.ASCII_TABLE[48]);
199                                    }
200    
201                                    sb.append(hexString);
202                                    sb.append(postfix);
203                            }
204                    }
205    
206                    if (sb.length() == text.length()) {
207                            return text;
208                    }
209                    else {
210                            return sb.toString();
211                    }
212            }
213    
        /**
         * Escapes the text for use inside an HTML attribute value by delegating
         * to {@link #escape(String, int)} with {@link #ESCAPE_MODE_ATTRIBUTE}.
         *
         * @param  attribute the attribute text to escape
         * @return the escaped text, or <code>null</code> if the attribute is
         *         <code>null</code>
         */
        @Override
        public String escapeAttribute(String attribute) {
                return escape(attribute, ESCAPE_MODE_ATTRIBUTE);
        }
218    
        /**
         * Escapes the text for use inside CSS by delegating to
         * {@link #escape(String, int)} with {@link #ESCAPE_MODE_CSS}.
         *
         * @param  css the CSS text to escape
         * @return the escaped text, or <code>null</code> if the CSS is
         *         <code>null</code>
         */
        @Override
        public String escapeCSS(String css) {
                return escape(css, ESCAPE_MODE_CSS);
        }
223    
224            @Override
225            public String escapeHREF(String href) {
226                    if (href == null) {
227                            return null;
228                    }
229    
230                    if (href.length() == 0) {
231                            return StringPool.BLANK;
232                    }
233    
234                    if (href.indexOf(StringPool.COLON) == 10) {
235                            String protocol = StringUtil.toLowerCase(href.substring(0, 10));
236    
237                            if (protocol.equals("javascript")) {
238                                    href = StringUtil.replaceFirst(href, StringPool.COLON, "%3a");
239                            }
240                    }
241    
242                    return escapeAttribute(href);
243            }
244    
        /**
         * Escapes the text for use inside JavaScript by delegating to
         * {@link #escape(String, int)} with {@link #ESCAPE_MODE_JS}.
         *
         * @param  js the JavaScript text to escape
         * @return the escaped text, or <code>null</code> if the text is
         *         <code>null</code>
         */
        @Override
        public String escapeJS(String js) {
                return escape(js, ESCAPE_MODE_JS);
        }
249    
        /**
         * Escapes the text for use in a URL by delegating to
         * {@link #escape(String, int)} with {@link #ESCAPE_MODE_URL}, which in
         * turn calls <code>HttpUtil.encodeURL</code>.
         *
         * @param  url the URL text to escape
         * @return the escaped text, or <code>null</code> if the URL is
         *         <code>null</code>
         */
        @Override
        public String escapeURL(String url) {
                return escape(url, ESCAPE_MODE_URL);
        }
254    
255            @Override
256            public String escapeXPath(String xPath) {
257                    if (Validator.isNull(xPath)) {
258                            return xPath;
259                    }
260    
261                    StringBuilder sb = new StringBuilder(xPath.length());
262    
263                    for (int i = 0; i < xPath.length(); i++) {
264                            char c = xPath.charAt(i);
265    
266                            boolean hasToken = false;
267    
268                            for (int j = 0; j < _XPATH_TOKENS.length; j++) {
269                                    if (c == _XPATH_TOKENS[j]) {
270                                            hasToken = true;
271    
272                                            break;
273                                    }
274                            }
275    
276                            if (hasToken) {
277                                    sb.append(StringPool.UNDERLINE);
278                            }
279                            else {
280                                    sb.append(c);
281                            }
282                    }
283    
284                    return sb.toString();
285            }
286    
287            @Override
288            public String escapeXPathAttribute(String xPathAttribute) {
289                    boolean hasApostrophe = xPathAttribute.contains(StringPool.APOSTROPHE);
290                    boolean hasQuote = xPathAttribute.contains(StringPool.QUOTE);
291    
292                    if (hasQuote && hasApostrophe) {
293                            String[] parts = xPathAttribute.split(StringPool.APOSTROPHE);
294    
295                            return "concat('".concat(
296                                    StringUtil.merge(parts, "', \"'\", '")).concat("')");
297                    }
298    
299                    if (hasQuote) {
300                            return StringPool.APOSTROPHE.concat(xPathAttribute).concat(
301                                    StringPool.APOSTROPHE);
302                    }
303    
304                    return StringPool.QUOTE.concat(xPathAttribute).concat(StringPool.QUOTE);
305            }
306    
307            @Override
308            public String extractText(String html) {
309                    if (html == null) {
310                            return null;
311                    }
312    
313                    Source source = new Source(html);
314    
315                    TextExtractor textExtractor = source.getTextExtractor();
316    
317                    return textExtractor.toString();
318            }
319    
        /**
         * Passes the text through <code>StringUtil.replace</code> with
         * {@code &amp;} as the search string and {@code &} as the replacement.
         * Only this one substitution of {@link #toInputSafe(String)} is reversed;
         * {@code &quot;} is left untouched.
         *
         * @param  text the text to convert (null handling is left to
         *         <code>StringUtil.replace</code>)
         * @return the converted text
         */
        @Override
        public String fromInputSafe(String text) {
                return StringUtil.replace(text, "&amp;", "&");
        }
324    
325            @Override
326            public String getAUICompatibleId(String text) {
327                    if (Validator.isNull(text)) {
328                            return text;
329                    }
330    
331                    StringBundler sb = null;
332    
333                    int lastReplacementIndex = 0;
334    
335                    for (int i = 0; i < text.length(); i++) {
336                            char c = text.charAt(i);
337    
338                            if (((c <= 127) && (Validator.isChar(c) || Validator.isDigit(c))) ||
339                                    ((c > 127) && (c != CharPool.FIGURE_SPACE) &&
340                                     (c != CharPool.NARROW_NO_BREAK_SPACE) &&
341                                     (c != CharPool.NO_BREAK_SPACE))) {
342    
343                                    continue;
344                            }
345    
346                            if (sb == null) {
347                                    sb = new StringBundler();
348                            }
349    
350                            if (i > lastReplacementIndex) {
351                                    sb.append(text.substring(lastReplacementIndex, i));
352                            }
353    
354                            sb.append(CharPool.UNDERLINE);
355    
356                            if (c != CharPool.UNDERLINE) {
357                                    sb.append(StringUtil.toHexString(c));
358                            }
359    
360                            sb.append(CharPool.UNDERLINE);
361    
362                            lastReplacementIndex = i + 1;
363                    }
364    
365                    if (sb == null) {
366                            return text;
367                    }
368    
369                    if (lastReplacementIndex < text.length()) {
370                            sb.append(text.substring(lastReplacementIndex));
371                    }
372    
373                    return sb.toString();
374            }
375    
376            @Deprecated
377            @Override
378            public String render(String html) {
379                    if (html == null) {
380                            return null;
381                    }
382    
383                    Source source = new Source(html);
384    
385                    Renderer renderer = source.getRenderer();
386    
387                    return renderer.toString();
388            }
389    
        /**
         * Passes the text through <code>StringUtil.replace</code> with
         * <code>_MS_WORD_UNICODE</code> as the search strings and
         * <code>_MS_WORD_HTML</code> as the replacements (presumably paired by
         * index -- confirm against <code>StringUtil.replace</code>).
         *
         * @param  text the text to convert (null handling is left to
         *         <code>StringUtil.replace</code>)
         * @return the converted text
         */
        @Override
        public String replaceMsWordCharacters(String text) {
                return StringUtil.replace(text, _MS_WORD_UNICODE, _MS_WORD_HTML);
        }
394    
395            @Override
396            public String replaceNewLine(String text) {
397                    if (text == null) {
398                            return null;
399                    }
400    
401                    return text.replaceAll("\r?\n", "<br />");
402            }
403    
        /**
         * Delegates to <code>StringUtil.stripBetween</code> with
         * {@code "<" + tag} as the start marker and {@code "</" + tag + ">"} as
         * the end marker. The start marker has no closing bracket, so it also
         * matches an opening tag that carries attributes.
         *
         * @param  text the text to process (null handling is left to
         *         <code>StringUtil.stripBetween</code>)
         * @param  tag the tag name, without angle brackets
         * @return the result of <code>StringUtil.stripBetween</code>
         */
        @Override
        public String stripBetween(String text, String tag) {
                return StringUtil.stripBetween(text, "<" + tag, "</" + tag + ">");
        }
408    
        /**
         * Delegates to <code>StringUtil.stripBetween</code> with the HTML comment
         * delimiters {@code <!--} and {@code -->} as the start and end markers.
         *
         * @param  text the text to process (null handling is left to
         *         <code>StringUtil.stripBetween</code>)
         * @return the result of <code>StringUtil.stripBetween</code>
         */
        @Override
        public String stripComments(String text) {
                return StringUtil.stripBetween(text, "<!--", "-->");
        }
413    
414            @Override
415            public String stripHtml(String text) {
416                    if (text == null) {
417                            return null;
418                    }
419    
420                    text = stripComments(text);
421    
422                    StringBuilder sb = new StringBuilder(text.length());
423    
424                    int x = 0;
425                    int y = text.indexOf("<");
426    
427                    while (y != -1) {
428                            sb.append(text.substring(x, y));
429                            sb.append(StringPool.SPACE);
430    
431                            // Look for text enclosed by <abc></abc>
432    
433                            if (isTag(_TAG_SCRIPT, text, y + 1)) {
434                                    y = stripTag(_TAG_SCRIPT, text, y);
435                            }
436                            else if (isTag(_TAG_STYLE, text, y + 1)) {
437                                    y = stripTag(_TAG_STYLE, text, y);
438                            }
439    
440                            x = text.indexOf(">", y);
441    
442                            if (x == -1) {
443                                    break;
444                            }
445    
446                            x++;
447    
448                            if (x < y) {
449    
450                                    // <b>Hello</b
451    
452                                    break;
453                            }
454    
455                            y = text.indexOf("<", x);
456                    }
457    
458                    if (y == -1) {
459                            sb.append(text.substring(x));
460                    }
461    
462                    return sb.toString();
463            }
464    
        /**
         * Passes the text through <code>StringUtil.replace</code> with the search
         * strings {@code &} and {@code "} and the replacements {@code &amp;} and
         * {@code &quot;}.
         *
         * @param  text the text to convert (null handling is left to
         *         <code>StringUtil.replace</code>)
         * @return the converted text
         */
        @Override
        public String toInputSafe(String text) {
                return StringUtil.replace(
                        text,
                        new String[] {"&", "\""},
                        new String[] {"&amp;", "&quot;"});
        }
472    
473            @Override
474            public String unescape(String text) {
475                    if (text == null) {
476                            return null;
477                    }
478    
479                    if (text.length() == 0) {
480                            return StringPool.BLANK;
481                    }
482    
483                    // Optimize this
484    
485                    text = StringUtil.replace(text, "&lt;", "<");
486                    text = StringUtil.replace(text, "&gt;", ">");
487                    text = StringUtil.replace(text, "&amp;", "&");
488                    text = StringUtil.replace(text, "&rsquo;", "\u2019");
489                    text = StringUtil.replace(text, "&#034;", "\"");
490                    text = StringUtil.replace(text, "&#039;", "'");
491                    text = StringUtil.replace(text, "&#040;", "(");
492                    text = StringUtil.replace(text, "&#041;", ")");
493                    text = StringUtil.replace(text, "&#044;", ",");
494                    text = StringUtil.replace(text, "&#035;", "#");
495                    text = StringUtil.replace(text, "&#037;", "%");
496                    text = StringUtil.replace(text, "&#059;", ";");
497                    text = StringUtil.replace(text, "&#061;", "=");
498                    text = StringUtil.replace(text, "&#043;", "+");
499                    text = StringUtil.replace(text, "&#045;", "-");
500    
501                    return text;
502            }
503    
504            @Override
505            public String unescapeCDATA(String text) {
506                    if (text == null) {
507                            return null;
508                    }
509    
510                    if (text.length() == 0) {
511                            return StringPool.BLANK;
512                    }
513    
514                    text = StringUtil.replace(text, "&lt;![CDATA[", "<![CDATA[");
515                    text = StringUtil.replace(text, "]]&gt;", "]]>");
516    
517                    return text;
518            }
519    
520            @Override
521            public String wordBreak(String text, int columns) {
522                    StringBundler sb = new StringBundler();
523    
524                    int length = 0;
525                    int lastWrite = 0;
526                    int pos = 0;
527    
528                    Pattern pattern = Pattern.compile("([\\s<&]|$)");
529    
530                    Matcher matcher = pattern.matcher(text);
531    
532                    while (matcher.find()) {
533                            if (matcher.start() < pos) {
534                                    continue;
535                            }
536    
537                            while ((length + matcher.start() - pos) >= columns) {
538                                    pos += columns - length;
539    
540                                    sb.append(text.substring(lastWrite, pos));
541                                    sb.append("<wbr/>&shy;");
542    
543                                    length = 0;
544                                    lastWrite = pos;
545                            }
546    
547                            length += matcher.start() - pos;
548    
549                            String group = matcher.group();
550    
551                            if (group.equals(StringPool.AMPERSAND)) {
552                                    int x = text.indexOf(StringPool.SEMICOLON, matcher.start());
553    
554                                    if (x != -1) {
555                                            length++;
556                                            pos = x + 1;
557                                    }
558    
559                                    continue;
560                            }
561    
562                            if (group.equals(StringPool.LESS_THAN)) {
563                                    int x = text.indexOf(StringPool.GREATER_THAN, matcher.start());
564    
565                                    if (x != -1) {
566                                            pos = x + 1;
567                                    }
568    
569                                    continue;
570                            }
571    
572                            if (group.equals(StringPool.SPACE) ||
573                                    group.equals(StringPool.NEW_LINE)) {
574    
575                                    length = 0;
576                                    pos = matcher.start() + 1;
577                            }
578                    }
579    
580                    sb.append(text.substring(lastWrite));
581    
582                    return sb.toString();
583            }
584    
585            protected boolean isTag(char[] tag, String text, int pos) {
586                    if ((pos + tag.length + 1) <= text.length()) {
587                            char item;
588    
589                            for (int i = 0; i < tag.length; i++) {
590                                    item = text.charAt(pos++);
591    
592                                    if (Character.toLowerCase(item) != tag[i]) {
593                                            return false;
594                                    }
595                            }
596    
597                            item = text.charAt(pos);
598    
599                            // Check that char after tag is not a letter (i.e. another tag)
600    
601                            return !Character.isLetter(item);
602                    }
603                    else {
604                            return false;
605                    }
606            }
607    
608            protected int stripTag(char[] tag, String text, int pos) {
609                    int x = pos + _TAG_SCRIPT.length;
610    
611                    // Find end of the tag
612    
613                    x = text.indexOf(">", x);
614    
615                    if (x < 0) {
616                            return pos;
617                    }
618    
619                    // Check if preceding character is / (i.e. is this instance of <abc/>)
620    
621                    if (text.charAt(x-1) == '/') {
622                            return pos;
623                    }
624    
625                    // Search for the ending </abc> tag
626    
627                    while (true) {
628                            x = text.indexOf("</", x);
629    
630                            if (x >= 0) {
631                                    if (isTag(tag, text, x + 2)) {
632                                            pos = x;
633    
634                                            break;
635                                    }
636                                    else {
637    
638                                            // Skip past "</"
639    
640                                            x += 2;
641                                    }
642                            }
643                            else {
644                                    break;
645                            }
646                    }
647    
648                    return pos;
649            }
650    
        // Replacement strings passed to StringUtil.replace by
        // replaceMsWordCharacters; listed in the same order as _MS_WORD_UNICODE

        private static final String[] _MS_WORD_HTML = new String[] {
                "&reg;", StringPool.APOSTROPHE, StringPool.QUOTE, StringPool.QUOTE
        };

        // Search strings passed to StringUtil.replace by replaceMsWordCharacters:
        // registered sign, right single quotation mark, left double quotation
        // mark and right double quotation mark

        private static final String[] _MS_WORD_UNICODE = new String[] {
                "\u00ae", "\u2019", "\u201c", "\u201d"
        };

        // Lower case tag name whose element content stripHtml drops

        private static final char[] _TAG_SCRIPT = {'s', 'c', 'r', 'i', 'p', 't'};

        // Lower case tag name whose element content stripHtml drops

        private static final char[] _TAG_STYLE = {'s', 't', 'y', 'l', 'e'};
662    
663            // See http://www.w3.org/TR/xpath20/#lexical-structure
664    
665            private static final char[] _XPATH_TOKENS = {
666                    '(', ')', '[', ']', '.', '@', ',', ':', '/', '|', '+', '-', '=', '!',
667                    '<', '>', '*', '$', '"', '"', ' ', 9, 10, 13, 133, 8232};
668    
669    }