001    /**
002     * Copyright (c) 2000-2013 Liferay, Inc. All rights reserved.
003     *
004     * The contents of this file are subject to the terms of the Liferay Enterprise
005     * Subscription License ("License"). You may not use this file except in
006     * compliance with the License. You can obtain a copy of the License by
007     * contacting Liferay, Inc. See the License for the specific language governing
008     * permissions and limitations under the License, including but not limited to
009     * distribution rights of the Software.
010     *
011     *
012     *
013     */
014    
015    package com.liferay.portal.util;
016    
017    import com.liferay.portal.kernel.security.pacl.DoPrivileged;
018    import com.liferay.portal.kernel.util.CharPool;
019    import com.liferay.portal.kernel.util.Html;
020    import com.liferay.portal.kernel.util.HttpUtil;
021    import com.liferay.portal.kernel.util.StringBundler;
022    import com.liferay.portal.kernel.util.StringPool;
023    import com.liferay.portal.kernel.util.StringUtil;
024    import com.liferay.portal.kernel.util.Validator;
025    
026    import java.util.regex.Matcher;
027    import java.util.regex.Pattern;
028    
029    import net.htmlparser.jericho.Renderer;
030    import net.htmlparser.jericho.Source;
031    import net.htmlparser.jericho.TextExtractor;
032    
033    /**
034     * @author Brian Wing Shun Chan
035     * @author Clarence Shen
036     * @author Harry Mark
037     * @author Samuel Kong
038     * @author Connor McKay
039     * @author Shuyang Zhou
040     */
041    @DoPrivileged
042    public class HtmlImpl implements Html {
043    
044            public static final int ESCAPE_MODE_ATTRIBUTE = 1;
045    
046            public static final int ESCAPE_MODE_CSS = 2;
047    
048            public static final int ESCAPE_MODE_JS = 3;
049    
050            public static final int ESCAPE_MODE_TEXT = 4;
051    
052            public static final int ESCAPE_MODE_URL = 5;
053    
054            /**
055             * Escapes the text so that it is safe to use in an HTML context.
056             *
057             * @param  text the text to escape
058             * @return the escaped HTML text, or <code>null</code> if the text is
059             *         <code>null</code>
060             */
061            @Override
062            public String escape(String text) {
063                    if (text == null) {
064                            return null;
065                    }
066    
067                    if (text.length() == 0) {
068                            return StringPool.BLANK;
069                    }
070    
071                    // Escape using XSS recommendations from
072                    // http://www.owasp.org/index.php/Cross_Site_Scripting
073                    // #How_to_Protect_Yourself
074    
075                    StringBundler sb = null;
076    
077                    int lastReplacementIndex = 0;
078    
079                    for (int i = 0; i < text.length(); i++) {
080                            char c = text.charAt(i);
081    
082                            String replacement = null;
083    
084                            if (c == '<') {
085                                    replacement = "&lt;";
086                            }
087                            else if (c == '>') {
088                                    replacement = "&gt;";
089                            }
090                            else if (c == '&') {
091                                    replacement = "&amp;";
092                            }
093                            else if (c == '"') {
094                                    replacement = "&#034;";
095                            }
096                            else if (c == '\'') {
097                                    replacement = "&#039;";
098                            }
099                            else if (c == '\u00bb') {
100                                    replacement = "&#187;";
101                            }
102                            else if (c == '\u2013') {
103                                    replacement = "&#x2013;";
104                            }
105                            else if (c == '\u2014') {
106                                    replacement = "&#x2014;";
107                            }
108                            else if (c == '\u2028') {
109                                    replacement = "&#x8232;";
110                            }
111                            else if (!_isValidXmlCharacter(c) ||
112                                             _isUnicodeCompatibilityCharacter(c)) {
113    
114                                    replacement = StringPool.SPACE;
115                            }
116    
117                            if (replacement != null) {
118                                    if (sb == null) {
119                                            sb = new StringBundler();
120                                    }
121    
122                                    if (i > lastReplacementIndex) {
123                                            sb.append(text.substring(lastReplacementIndex, i));
124                                    }
125    
126                                    sb.append(replacement);
127    
128                                    lastReplacementIndex = i + 1;
129                            }
130                    }
131    
132                    if (sb == null) {
133                            return text;
134                    }
135    
136                    if (lastReplacementIndex < text.length()) {
137                            sb.append(text.substring(lastReplacementIndex));
138                    }
139    
140                    return sb.toString();
141            }
142    
143            @Override
144            public String escape(String text, int type) {
145                    if (text == null) {
146                            return null;
147                    }
148    
149                    if (text.length() == 0) {
150                            return StringPool.BLANK;
151                    }
152    
153                    String prefix = StringPool.BLANK;
154                    String postfix = StringPool.BLANK;
155    
156                    if (type == ESCAPE_MODE_ATTRIBUTE) {
157                            prefix = "&#x";
158                            postfix = StringPool.SEMICOLON;
159                    }
160                    else if (type == ESCAPE_MODE_CSS) {
161                            prefix = StringPool.BACK_SLASH;
162                    }
163                    else if (type == ESCAPE_MODE_JS) {
164                            prefix = "\\x";
165                    }
166                    else if (type == ESCAPE_MODE_URL) {
167                            return HttpUtil.encodeURL(text, true);
168                    }
169                    else {
170                            return escape(text);
171                    }
172    
173                    StringBuilder sb = new StringBuilder();
174    
175                    for (int i = 0; i < text.length(); i++) {
176                            char c = text.charAt(i);
177    
178                            if ((type == ESCAPE_MODE_ATTRIBUTE) &&
179                                    (!_isValidXmlCharacter(c) ||
180                                     _isUnicodeCompatibilityCharacter(c))) {
181    
182                                    sb.append(StringPool.SPACE);
183                            }
184                            else if ((c > 255) || Character.isLetterOrDigit(c) ||
185                                             (c == CharPool.DASH) || (c == CharPool.UNDERLINE)) {
186    
187                                    sb.append(c);
188                            }
189                            else {
190                                    sb.append(prefix);
191    
192                                    String hexString = StringUtil.toHexString(c);
193    
194                                    if (hexString.length() == 1) {
195                                            sb.append(StringPool.ASCII_TABLE[48]);
196                                    }
197    
198                                    sb.append(hexString);
199                                    sb.append(postfix);
200                            }
201                    }
202    
203                    if ((type != ESCAPE_MODE_ATTRIBUTE) && (sb.length() == text.length())) {
204                            return text;
205                    }
206                    else {
207                            return sb.toString();
208                    }
209            }
210    
211            @Override
212            public String escapeAttribute(String attribute) {
213                    return escape(attribute, ESCAPE_MODE_ATTRIBUTE);
214            }
215    
216            @Override
217            public String escapeCSS(String css) {
218                    return escape(css, ESCAPE_MODE_CSS);
219            }
220    
221            @Override
222            public String escapeHREF(String href) {
223                    if (href == null) {
224                            return null;
225                    }
226    
227                    if (href.length() == 0) {
228                            return StringPool.BLANK;
229                    }
230    
231                    int index = href.indexOf(StringPool.COLON);
232    
233                    if (index == 4) {
234                            String protocol = StringUtil.toLowerCase(href.substring(0, 4));
235    
236                            if (protocol.equals("data")) {
237                                    href = StringUtil.replaceFirst(href, StringPool.COLON, "%3a");
238                            }
239                    }
240                    else if (index == 10) {
241                            String protocol = StringUtil.toLowerCase(href.substring(0, 10));
242    
243                            if (protocol.equals("javascript")) {
244                                    href = StringUtil.replaceFirst(href, StringPool.COLON, "%3a");
245                            }
246                    }
247    
248                    return escapeAttribute(href);
249            }
250    
251            @Override
252            public String escapeJS(String js) {
253                    return escape(js, ESCAPE_MODE_JS);
254            }
255    
256            @Override
257            public String escapeURL(String url) {
258                    return escape(url, ESCAPE_MODE_URL);
259            }
260    
261            @Override
262            public String escapeXPath(String xPath) {
263                    if (Validator.isNull(xPath)) {
264                            return xPath;
265                    }
266    
267                    StringBuilder sb = new StringBuilder(xPath.length());
268    
269                    for (int i = 0; i < xPath.length(); i++) {
270                            char c = xPath.charAt(i);
271    
272                            boolean hasToken = false;
273    
274                            for (int j = 0; j < _XPATH_TOKENS.length; j++) {
275                                    if (c == _XPATH_TOKENS[j]) {
276                                            hasToken = true;
277    
278                                            break;
279                                    }
280                            }
281    
282                            if (hasToken) {
283                                    sb.append(StringPool.UNDERLINE);
284                            }
285                            else {
286                                    sb.append(c);
287                            }
288                    }
289    
290                    return sb.toString();
291            }
292    
293            @Override
294            public String escapeXPathAttribute(String xPathAttribute) {
295                    boolean hasApostrophe = xPathAttribute.contains(StringPool.APOSTROPHE);
296                    boolean hasQuote = xPathAttribute.contains(StringPool.QUOTE);
297    
298                    if (hasQuote && hasApostrophe) {
299                            String[] parts = xPathAttribute.split(StringPool.APOSTROPHE);
300    
301                            return "concat('".concat(
302                                    StringUtil.merge(parts, "', \"'\", '")).concat("')");
303                    }
304    
305                    if (hasQuote) {
306                            return StringPool.APOSTROPHE.concat(xPathAttribute).concat(
307                                    StringPool.APOSTROPHE);
308                    }
309    
310                    return StringPool.QUOTE.concat(xPathAttribute).concat(StringPool.QUOTE);
311            }
312    
313            @Override
314            public String extractText(String html) {
315                    if (html == null) {
316                            return null;
317                    }
318    
319                    Source source = new Source(html);
320    
321                    TextExtractor textExtractor = source.getTextExtractor();
322    
323                    return textExtractor.toString();
324            }
325    
326            @Override
327            public String fromInputSafe(String text) {
328                    return StringUtil.replace(text, "&amp;", "&");
329            }
330    
331            @Override
332            public String getAUICompatibleId(String text) {
333                    if (Validator.isNull(text)) {
334                            return text;
335                    }
336    
337                    StringBundler sb = null;
338    
339                    int lastReplacementIndex = 0;
340    
341                    for (int i = 0; i < text.length(); i++) {
342                            char c = text.charAt(i);
343    
344                            if (((c <= 127) && (Validator.isChar(c) || Validator.isDigit(c))) ||
345                                    ((c > 127) && (c != CharPool.FIGURE_SPACE) &&
346                                     (c != CharPool.NARROW_NO_BREAK_SPACE) &&
347                                     (c != CharPool.NO_BREAK_SPACE))) {
348    
349                                    continue;
350                            }
351    
352                            if (sb == null) {
353                                    sb = new StringBundler();
354                            }
355    
356                            if (i > lastReplacementIndex) {
357                                    sb.append(text.substring(lastReplacementIndex, i));
358                            }
359    
360                            sb.append(CharPool.UNDERLINE);
361    
362                            if (c != CharPool.UNDERLINE) {
363                                    sb.append(StringUtil.toHexString(c));
364                            }
365    
366                            sb.append(CharPool.UNDERLINE);
367    
368                            lastReplacementIndex = i + 1;
369                    }
370    
371                    if (sb == null) {
372                            return text;
373                    }
374    
375                    if (lastReplacementIndex < text.length()) {
376                            sb.append(text.substring(lastReplacementIndex));
377                    }
378    
379                    return sb.toString();
380            }
381    
382            @Deprecated
383            @Override
384            public String render(String html) {
385                    if (html == null) {
386                            return null;
387                    }
388    
389                    Source source = new Source(html);
390    
391                    Renderer renderer = source.getRenderer();
392    
393                    return renderer.toString();
394            }
395    
396            @Override
397            public String replaceMsWordCharacters(String text) {
398                    return StringUtil.replace(text, _MS_WORD_UNICODE, _MS_WORD_HTML);
399            }
400    
401            @Override
402            public String replaceNewLine(String text) {
403                    if (text == null) {
404                            return null;
405                    }
406    
407                    return text.replaceAll("\r?\n", "<br />");
408            }
409    
410            @Override
411            public String stripBetween(String text, String tag) {
412                    return StringUtil.stripBetween(text, "<" + tag, "</" + tag + ">");
413            }
414    
415            @Override
416            public String stripComments(String text) {
417                    return StringUtil.stripBetween(text, "<!--", "-->");
418            }
419    
420            @Override
421            public String stripHtml(String text) {
422                    if (text == null) {
423                            return null;
424                    }
425    
426                    text = stripComments(text);
427    
428                    StringBuilder sb = new StringBuilder(text.length());
429    
430                    int x = 0;
431                    int y = text.indexOf("<");
432    
433                    while (y != -1) {
434                            sb.append(text.substring(x, y));
435                            sb.append(StringPool.SPACE);
436    
437                            // Look for text enclosed by <abc></abc>
438    
439                            if (isTag(_TAG_SCRIPT, text, y + 1)) {
440                                    y = stripTag(_TAG_SCRIPT, text, y);
441                            }
442                            else if (isTag(_TAG_STYLE, text, y + 1)) {
443                                    y = stripTag(_TAG_STYLE, text, y);
444                            }
445    
446                            x = text.indexOf(">", y);
447    
448                            if (x == -1) {
449                                    break;
450                            }
451    
452                            x++;
453    
454                            if (x < y) {
455    
456                                    // <b>Hello</b
457    
458                                    break;
459                            }
460    
461                            y = text.indexOf("<", x);
462                    }
463    
464                    if (y == -1) {
465                            sb.append(text.substring(x));
466                    }
467    
468                    return sb.toString();
469            }
470    
471            @Override
472            public String toInputSafe(String text) {
473                    return StringUtil.replace(
474                            text,
475                            new String[] {"&", "\""},
476                            new String[] {"&amp;", "&quot;"});
477            }
478    
479            @Override
480            public String unescape(String text) {
481                    if (text == null) {
482                            return null;
483                    }
484    
485                    if (text.length() == 0) {
486                            return StringPool.BLANK;
487                    }
488    
489                    // Optimize this
490    
491                    text = StringUtil.replace(text, "&lt;", "<");
492                    text = StringUtil.replace(text, "&gt;", ">");
493                    text = StringUtil.replace(text, "&amp;", "&");
494                    text = StringUtil.replace(text, "&rsquo;", "\u2019");
495                    text = StringUtil.replace(text, "&#034;", "\"");
496                    text = StringUtil.replace(text, "&#039;", "'");
497                    text = StringUtil.replace(text, "&#040;", "(");
498                    text = StringUtil.replace(text, "&#041;", ")");
499                    text = StringUtil.replace(text, "&#044;", ",");
500                    text = StringUtil.replace(text, "&#035;", "#");
501                    text = StringUtil.replace(text, "&#037;", "%");
502                    text = StringUtil.replace(text, "&#059;", ";");
503                    text = StringUtil.replace(text, "&#061;", "=");
504                    text = StringUtil.replace(text, "&#043;", "+");
505                    text = StringUtil.replace(text, "&#045;", "-");
506    
507                    return text;
508            }
509    
510            @Override
511            public String unescapeCDATA(String text) {
512                    if (text == null) {
513                            return null;
514                    }
515    
516                    if (text.length() == 0) {
517                            return StringPool.BLANK;
518                    }
519    
520                    text = StringUtil.replace(text, "&lt;![CDATA[", "<![CDATA[");
521                    text = StringUtil.replace(text, "]]&gt;", "]]>");
522    
523                    return text;
524            }
525    
526            @Override
527            public String wordBreak(String text, int columns) {
528                    StringBundler sb = new StringBundler();
529    
530                    int length = 0;
531                    int lastWrite = 0;
532                    int pos = 0;
533    
534                    Pattern pattern = Pattern.compile("([\\s<&]|$)");
535    
536                    Matcher matcher = pattern.matcher(text);
537    
538                    while (matcher.find()) {
539                            if (matcher.start() < pos) {
540                                    continue;
541                            }
542    
543                            while ((length + matcher.start() - pos) >= columns) {
544                                    pos += columns - length;
545    
546                                    sb.append(text.substring(lastWrite, pos));
547                                    sb.append("<wbr/>&shy;");
548    
549                                    length = 0;
550                                    lastWrite = pos;
551                            }
552    
553                            length += matcher.start() - pos;
554    
555                            String group = matcher.group();
556    
557                            if (group.equals(StringPool.AMPERSAND)) {
558                                    int x = text.indexOf(StringPool.SEMICOLON, matcher.start());
559    
560                                    if (x != -1) {
561                                            length++;
562                                            pos = x + 1;
563                                    }
564    
565                                    continue;
566                            }
567    
568                            if (group.equals(StringPool.LESS_THAN)) {
569                                    int x = text.indexOf(StringPool.GREATER_THAN, matcher.start());
570    
571                                    if (x != -1) {
572                                            pos = x + 1;
573                                    }
574    
575                                    continue;
576                            }
577    
578                            if (group.equals(StringPool.SPACE) ||
579                                    group.equals(StringPool.NEW_LINE)) {
580    
581                                    length = 0;
582                                    pos = matcher.start() + 1;
583                            }
584                    }
585    
586                    sb.append(text.substring(lastWrite));
587    
588                    return sb.toString();
589            }
590    
591            protected boolean isTag(char[] tag, String text, int pos) {
592                    if ((pos + tag.length + 1) <= text.length()) {
593                            char item;
594    
595                            for (int i = 0; i < tag.length; i++) {
596                                    item = text.charAt(pos++);
597    
598                                    if (Character.toLowerCase(item) != tag[i]) {
599                                            return false;
600                                    }
601                            }
602    
603                            item = text.charAt(pos);
604    
605                            // Check that char after tag is not a letter (i.e. another tag)
606    
607                            return !Character.isLetter(item);
608                    }
609                    else {
610                            return false;
611                    }
612            }
613    
614            protected int stripTag(char[] tag, String text, int pos) {
615                    int x = pos + _TAG_SCRIPT.length;
616    
617                    // Find end of the tag
618    
619                    x = text.indexOf(">", x);
620    
621                    if (x < 0) {
622                            return pos;
623                    }
624    
625                    // Check if preceding character is / (i.e. is this instance of <abc/>)
626    
627                    if (text.charAt(x-1) == '/') {
628                            return pos;
629                    }
630    
631                    // Search for the ending </abc> tag
632    
633                    while (true) {
634                            x = text.indexOf("</", x);
635    
636                            if (x >= 0) {
637                                    if (isTag(tag, text, x + 2)) {
638                                            pos = x;
639    
640                                            break;
641                                    }
642                                    else {
643    
644                                            // Skip past "</"
645    
646                                            x += 2;
647                                    }
648                            }
649                            else {
650                                    break;
651                            }
652                    }
653    
654                    return pos;
655            }
656    
657            private boolean _isUnicodeCompatibilityCharacter(char c) {
658                    if (((c >= '\u007f') && (c <= '\u0084')) ||
659                            ((c >= '\u0086') && (c <= '\u009f')) ||
660                            ((c >= '\ufdd0') && (c <= '\ufdef'))) {
661    
662                            return true;
663                    }
664    
665                    return false;
666            }
667    
668            private boolean _isValidXmlCharacter(char c) {
669                    if ((c == '\u0009') || (c == CharPool.NEW_LINE) ||
670                            (c == CharPool.RETURN) || ((c >= '\u0020') && (c <= '\ud7ff')) ||
671                            ((c >= '\ue000') && (c <= '\ufffd')) ||
672                            Character.isLowSurrogate(c) || Character.isHighSurrogate(c)) {
673    
674                            return true;
675                    }
676    
677                    return false;
678            }
679    
680            private static final String[] _MS_WORD_HTML = new String[] {
681                    "&reg;", StringPool.APOSTROPHE, StringPool.QUOTE, StringPool.QUOTE
682            };
683    
684            private static final String[] _MS_WORD_UNICODE = new String[] {
685                    "\u00ae", "\u2019", "\u201c", "\u201d"
686            };
687    
688            private static final char[] _TAG_SCRIPT = {'s', 'c', 'r', 'i', 'p', 't'};
689    
690            private static final char[] _TAG_STYLE = {'s', 't', 'y', 'l', 'e'};
691    
692            // See http://www.w3.org/TR/xpath20/#lexical-structure
693    
694            private static final char[] _XPATH_TOKENS = {
695                    '(', ')', '[', ']', '.', '@', ',', ':', '/', '|', '+', '-', '=', '!',
696                    '<', '>', '*', '$', '"', '"', ' ', 9, 10, 13, 133, 8232};
697    
698    }