001    /**
002     * Copyright (c) 2000-2013 Liferay, Inc. All rights reserved.
003     *
004     * This library is free software; you can redistribute it and/or modify it under
005     * the terms of the GNU Lesser General Public License as published by the Free
006     * Software Foundation; either version 2.1 of the License, or (at your option)
007     * any later version.
008     *
009     * This library is distributed in the hope that it will be useful, but WITHOUT
010     * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
011     * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
012     * details.
013     */
014    
015    package com.liferay.portal.util;
016    
017    import com.liferay.portal.kernel.security.pacl.DoPrivileged;
018    import com.liferay.portal.kernel.util.CharPool;
019    import com.liferay.portal.kernel.util.Html;
020    import com.liferay.portal.kernel.util.HttpUtil;
021    import com.liferay.portal.kernel.util.StringBundler;
022    import com.liferay.portal.kernel.util.StringPool;
023    import com.liferay.portal.kernel.util.StringUtil;
024    import com.liferay.portal.kernel.util.Validator;
025    
026    import java.util.regex.Matcher;
027    import java.util.regex.Pattern;
028    
029    import net.htmlparser.jericho.Renderer;
030    import net.htmlparser.jericho.Source;
031    import net.htmlparser.jericho.TextExtractor;
032    
033    /**
034     * @author Brian Wing Shun Chan
035     * @author Clarence Shen
036     * @author Harry Mark
037     * @author Samuel Kong
038     * @author Connor McKay
039     * @author Shuyang Zhou
040     */
041    @DoPrivileged
042    public class HtmlImpl implements Html {
043    
        /**
         * Mode for {@link #escape(String, int)} that encodes characters as
         * hexadecimal HTML character references (<code>&amp;#x...;</code>).
         */
        public static final int ESCAPE_MODE_ATTRIBUTE = 1;

        /**
         * Mode for {@link #escape(String, int)} that encodes characters as
         * backslash-prefixed hexadecimal escapes.
         */
        public static final int ESCAPE_MODE_CSS = 2;

        /**
         * Mode for {@link #escape(String, int)} that encodes characters as
         * JavaScript hexadecimal escape sequences.
         */
        public static final int ESCAPE_MODE_JS = 3;

        /**
         * Mode for {@link #escape(String, int)} that is not tested explicitly
         * there; like any other unrecognized value it ends up in
         * {@link #escape(String)}.
         */
        public static final int ESCAPE_MODE_TEXT = 4;

        /**
         * Mode for {@link #escape(String, int)} that delegates to
         * <code>HttpUtil.encodeURL(text, true)</code>.
         */
        public static final int ESCAPE_MODE_URL = 5;
053    
054            @Override
055            public String escape(String text) {
056                    if (text == null) {
057                            return null;
058                    }
059    
060                    if (text.length() == 0) {
061                            return StringPool.BLANK;
062                    }
063    
064                    // Escape using XSS recommendations from
065                    // http://www.owasp.org/index.php/Cross_Site_Scripting
066                    // #How_to_Protect_Yourself
067    
068                    StringBundler sb = null;
069    
070                    int lastReplacementIndex = 0;
071    
072                    for (int i = 0; i < text.length(); i++) {
073                            char c = text.charAt(i);
074    
075                            String replacement = null;
076    
077                            switch (c) {
078                                    case '<':
079                                            replacement = "&lt;";
080    
081                                            break;
082    
083                                    case '>':
084                                            replacement = "&gt;";
085    
086                                            break;
087    
088                                    case '&':
089                                            replacement = "&amp;";
090    
091                                            break;
092    
093                                    case '"':
094                                            replacement = "&#034;";
095    
096                                            break;
097    
098                                    case '\'':
099                                            replacement = "&#039;";
100    
101                                            break;
102    
103                                    case '\u00bb': // '???'
104                                            replacement = "&#187;";
105    
106                                            break;
107    
108                                    case '\u2013':
109                                            replacement = "&#x2013;";
110    
111                                            break;
112    
113                                    case '\u2014':
114                                            replacement = "&#x2014;";
115    
116                                            break;
117                            }
118    
119                            if (replacement != null) {
120                                    if (sb == null) {
121                                            sb = new StringBundler();
122                                    }
123    
124                                    if (i > lastReplacementIndex) {
125                                            sb.append(text.substring(lastReplacementIndex, i));
126                                    }
127    
128                                    sb.append(replacement);
129    
130                                    lastReplacementIndex = i + 1;
131                            }
132                    }
133    
134                    if (sb == null) {
135                            return text;
136                    }
137                    else {
138                            if (lastReplacementIndex < text.length()) {
139                                    sb.append(text.substring(lastReplacementIndex));
140                            }
141    
142                            return sb.toString();
143                    }
144            }
145    
146            @Override
147            public String escape(String text, int type) {
148                    if (text == null) {
149                            return null;
150                    }
151    
152                    if (text.length() == 0) {
153                            return StringPool.BLANK;
154                    }
155    
156                    String prefix = StringPool.BLANK;
157                    String postfix = StringPool.BLANK;
158    
159                    if (type == ESCAPE_MODE_ATTRIBUTE) {
160                            prefix = "&#x";
161                            postfix = StringPool.SEMICOLON;
162                    }
163                    else if (type == ESCAPE_MODE_CSS) {
164                            prefix = StringPool.BACK_SLASH;
165                    }
166                    else if (type == ESCAPE_MODE_JS) {
167                            prefix = "\\x";
168                    }
169                    else if (type == ESCAPE_MODE_URL) {
170                            return HttpUtil.encodeURL(text, true);
171                    }
172                    else {
173                            return escape(text);
174                    }
175    
176                    StringBuilder sb = new StringBuilder();
177    
178                    for (int i = 0; i < text.length(); i++) {
179                            char c = text.charAt(i);
180    
181                            if (Character.isLetterOrDigit(c) ||
182                                    (c == CharPool.DASH) || (c == CharPool.UNDERLINE)) {
183    
184                                    sb.append(c);
185                            }
186                            else {
187                                    sb.append(prefix);
188    
189                                    String hexString = StringUtil.toHexString(c);
190    
191                                    if (hexString.length() == 1) {
192                                            sb.append(StringPool.ASCII_TABLE[48]);
193                                    }
194    
195                                    sb.append(hexString);
196                                    sb.append(postfix);
197                            }
198                    }
199    
200                    if (sb.length() == text.length()) {
201                            return text;
202                    }
203                    else {
204                            return sb.toString();
205                    }
206            }
207    
208            @Override
209            public String escapeAttribute(String attribute) {
210                    return escape(attribute, ESCAPE_MODE_ATTRIBUTE);
211            }
212    
213            @Override
214            public String escapeCSS(String css) {
215                    return escape(css, ESCAPE_MODE_CSS);
216            }
217    
218            @Override
219            public String escapeHREF(String href) {
220                    if (href == null) {
221                            return null;
222                    }
223    
224                    if (href.length() == 0) {
225                            return StringPool.BLANK;
226                    }
227    
228                    if (href.indexOf(StringPool.COLON) == 10) {
229                            String protocol = href.substring(0, 10).toLowerCase();
230    
231                            if (protocol.equals("javascript")) {
232                                    href = StringUtil.replaceFirst(href, StringPool.COLON, "%3a");
233                            }
234                    }
235    
236                    return escapeAttribute(href);
237            }
238    
239            @Override
240            public String escapeJS(String js) {
241                    return escape(js, ESCAPE_MODE_JS);
242            }
243    
244            @Override
245            public String escapeURL(String url) {
246                    return escape(url, ESCAPE_MODE_URL);
247            }
248    
249            @Override
250            public String escapeXPath(String xPath) {
251                    if (Validator.isNull(xPath)) {
252                            return xPath;
253                    }
254    
255                    StringBuilder sb = new StringBuilder(xPath.length());
256    
257                    for (int i = 0; i < xPath.length(); i++) {
258                            char c = xPath.charAt(i);
259    
260                            boolean hasToken = false;
261    
262                            for (int j = 0; j < _XPATH_TOKENS.length; j++) {
263                                    if (c == _XPATH_TOKENS[j]) {
264                                            hasToken = true;
265    
266                                            break;
267                                    }
268                            }
269    
270                            if (hasToken) {
271                                    sb.append(StringPool.UNDERLINE);
272                            }
273                            else {
274                                    sb.append(c);
275                            }
276                    }
277    
278                    return sb.toString();
279            }
280    
281            @Override
282            public String escapeXPathAttribute(String xPathAttribute) {
283                    boolean hasApostrophe = xPathAttribute.contains(StringPool.APOSTROPHE);
284                    boolean hasQuote = xPathAttribute.contains(StringPool.QUOTE);
285    
286                    if (hasQuote && hasApostrophe) {
287                            String[] parts = xPathAttribute.split(StringPool.APOSTROPHE);
288    
289                            return "concat('".concat(
290                                    StringUtil.merge(parts, "', \"'\", '")).concat("')");
291                    }
292    
293                    if (hasQuote) {
294                            return StringPool.APOSTROPHE.concat(xPathAttribute).concat(
295                                    StringPool.APOSTROPHE);
296                    }
297    
298                    return StringPool.QUOTE.concat(xPathAttribute).concat(StringPool.QUOTE);
299            }
300    
301            @Override
302            public String extractText(String html) {
303                    if (html == null) {
304                            return null;
305                    }
306    
307                    Source source = new Source(html);
308    
309                    TextExtractor textExtractor = source.getTextExtractor();
310    
311                    return textExtractor.toString();
312            }
313    
314            @Override
315            public String fromInputSafe(String text) {
316                    return StringUtil.replace(text, "&amp;", "&");
317            }
318    
319            @Override
320            public String render(String html) {
321                    if (html == null) {
322                            return null;
323                    }
324    
325                    Source source = new Source(html);
326    
327                    Renderer renderer = source.getRenderer();
328    
329                    return renderer.toString();
330            }
331    
332            @Override
333            public String replaceMsWordCharacters(String text) {
334                    return StringUtil.replace(text, _MS_WORD_UNICODE, _MS_WORD_HTML);
335            }
336    
337            @Override
338            public String stripBetween(String text, String tag) {
339                    return StringUtil.stripBetween(text, "<" + tag, "</" + tag + ">");
340            }
341    
342            @Override
343            public String stripComments(String text) {
344                    return StringUtil.stripBetween(text, "<!--", "-->");
345            }
346    
347            @Override
348            public String stripHtml(String text) {
349                    if (text == null) {
350                            return null;
351                    }
352    
353                    text = stripComments(text);
354    
355                    StringBuilder sb = new StringBuilder(text.length());
356    
357                    int x = 0;
358                    int y = text.indexOf("<");
359    
360                    while (y != -1) {
361                            sb.append(text.substring(x, y));
362                            sb.append(StringPool.SPACE);
363    
364                            // Look for text enclosed by <abc></abc>
365    
366                            if (isTag(_TAG_SCRIPT, text, y + 1)) {
367                                    y = stripTag(_TAG_SCRIPT, text, y);
368                            }
369                            else if (isTag(_TAG_STYLE, text, y + 1)) {
370                                    y = stripTag(_TAG_STYLE, text, y);
371                            }
372    
373                            x = text.indexOf(">", y);
374    
375                            if (x == -1) {
376                                    break;
377                            }
378    
379                            x++;
380    
381                            if (x < y) {
382    
383                                    // <b>Hello</b
384    
385                                    break;
386                            }
387    
388                            y = text.indexOf("<", x);
389                    }
390    
391                    if (y == -1) {
392                            sb.append(text.substring(x));
393                    }
394    
395                    return sb.toString();
396            }
397    
398            @Override
399            public String toInputSafe(String text) {
400                    return StringUtil.replace(
401                            text,
402                            new String[] {"&", "\""},
403                            new String[] {"&amp;", "&quot;"});
404            }
405    
406            @Override
407            public String unescape(String text) {
408                    if (text == null) {
409                            return null;
410                    }
411    
412                    if (text.length() == 0) {
413                            return StringPool.BLANK;
414                    }
415    
416                    // Optimize this
417    
418                    text = StringUtil.replace(text, "&lt;", "<");
419                    text = StringUtil.replace(text, "&gt;", ">");
420                    text = StringUtil.replace(text, "&amp;", "&");
421                    text = StringUtil.replace(text, "&#034;", "\"");
422                    text = StringUtil.replace(text, "&#039;", "'");
423                    text = StringUtil.replace(text, "&#040;", "(");
424                    text = StringUtil.replace(text, "&#041;", ")");
425                    text = StringUtil.replace(text, "&#044;", ",");
426                    text = StringUtil.replace(text, "&#035;", "#");
427                    text = StringUtil.replace(text, "&#037;", "%");
428                    text = StringUtil.replace(text, "&#059;", ";");
429                    text = StringUtil.replace(text, "&#061;", "=");
430                    text = StringUtil.replace(text, "&#043;", "+");
431                    text = StringUtil.replace(text, "&#045;", "-");
432    
433                    return text;
434            }
435    
436            @Override
437            public String unescapeCDATA(String text) {
438                    if (text == null) {
439                            return null;
440                    }
441    
442                    if (text.length() == 0) {
443                            return StringPool.BLANK;
444                    }
445    
446                    text = StringUtil.replace(text, "&lt;![CDATA[", "<![CDATA[");
447                    text = StringUtil.replace(text, "]]&gt;", "]]>");
448    
449                    return text;
450            }
451    
452            @Override
453            public String wordBreak(String text, int columns) {
454                    StringBundler sb = new StringBundler();
455    
456                    int length = 0;
457                    int lastWrite = 0;
458                    int pos = 0;
459    
460                    Pattern pattern = Pattern.compile("([\\s<&]|$)");
461    
462                    Matcher matcher = pattern.matcher(text);
463    
464                    while (matcher.find()) {
465                            if (matcher.start() < pos) {
466                                    continue;
467                            }
468    
469                            while ((length + matcher.start() - pos) >= columns) {
470                                    pos += columns - length;
471    
472                                    sb.append(text.substring(lastWrite, pos));
473                                    sb.append("<wbr/>&shy;");
474    
475                                    length = 0;
476                                    lastWrite = pos;
477                            }
478    
479                            length += matcher.start() - pos;
480    
481                            String group = matcher.group();
482    
483                            if (group.equals(StringPool.AMPERSAND)) {
484                                    int x = text.indexOf(StringPool.SEMICOLON, matcher.start());
485    
486                                    if (x != -1) {
487                                            length++;
488                                            pos = x + 1;
489                                    }
490    
491                                    continue;
492                            }
493    
494                            if (group.equals(StringPool.LESS_THAN)) {
495                                    int x = text.indexOf(StringPool.GREATER_THAN, matcher.start());
496    
497                                    if (x != -1) {
498                                            pos = x + 1;
499                                    }
500    
501                                    continue;
502                            }
503    
504                            if (group.equals(StringPool.SPACE) ||
505                                    group.equals(StringPool.NEW_LINE)) {
506    
507                                    length = 0;
508                                    pos = matcher.start() + 1;
509                            }
510                    }
511    
512                    sb.append(text.substring(lastWrite));
513    
514                    return sb.toString();
515            }
516    
517            protected boolean isTag(char[] tag, String text, int pos) {
518                    if ((pos + tag.length + 1) <= text.length()) {
519                            char item;
520    
521                            for (int i = 0; i < tag.length; i++) {
522                                    item = text.charAt(pos++);
523    
524                                    if (Character.toLowerCase(item) != tag[i]) {
525                                            return false;
526                                    }
527                            }
528    
529                            item = text.charAt(pos);
530    
531                            // Check that char after tag is not a letter (i.e. another tag)
532    
533                            return !Character.isLetter(item);
534                    }
535                    else {
536                            return false;
537                    }
538            }
539    
540            protected int stripTag(char[] tag, String text, int pos) {
541                    int x = pos + _TAG_SCRIPT.length;
542    
543                    // Find end of the tag
544    
545                    x = text.indexOf(">", x);
546    
547                    if (x < 0) {
548                            return pos;
549                    }
550    
551                    // Check if preceding character is / (i.e. is this instance of <abc/>)
552    
553                    if (text.charAt(x-1) == '/') {
554                            return pos;
555                    }
556    
557                    // Search for the ending </abc> tag
558    
559                    for (;;) {
560                            x = text.indexOf("</", x);
561    
562                            if (x >= 0) {
563                                    if (isTag(tag, text, x + 2)) {
564                                            pos = x;
565    
566                                            break;
567                                    }
568                                    else {
569    
570                                            // Skip past "</"
571    
572                                            x += 2;
573                                    }
574                            }
575                            else {
576                                    break;
577                            }
578                    }
579    
580                    return pos;
581            }
582    
        /**
         * Replacements used by replaceMsWordCharacters, index aligned with
         * _MS_WORD_UNICODE.
         */
        private static final String[] _MS_WORD_HTML = new String[] {
                "&reg;", StringPool.APOSTROPHE, StringPool.QUOTE, StringPool.QUOTE
        };

        /**
         * Characters replaced by replaceMsWordCharacters: the registered sign
         * (U+00AE), the right single quotation mark (U+2019) and the left and
         * right double quotation marks (U+201C, U+201D).
         *
         * NOTE(review): the left single quotation mark (U+2018) is not listed;
         * confirm whether that is intended.
         */
        private static final String[] _MS_WORD_UNICODE = new String[] {
                "\u00ae", "\u2019", "\u201c", "\u201d"
        };

        /**
         * Lower case name of the script tag, as compared by isTag.
         */
        private static final char[] _TAG_SCRIPT = {'s', 'c', 'r', 'i', 'p', 't'};

        /**
         * Lower case name of the style tag, as compared by isTag.
         */
        private static final char[] _TAG_STYLE = {'s', 't', 'y', 'l', 'e'};
594    
595            // See http://www.w3.org/TR/xpath20/#lexical-structure
596    
597            private static final char[] _XPATH_TOKENS = {
598                    '(', ')', '[', ']', '.', '@', ',', ':', '/', '|', '+', '-', '=', '!',
599                    '<', '>', '*', '$', '"', '"', ' ', 9, 10, 13, 133, 8232};
600    
601    }