001    /**
002     * Copyright (c) 2000-2013 Liferay, Inc. All rights reserved.
003     *
004     * This library is free software; you can redistribute it and/or modify it under
005     * the terms of the GNU Lesser General Public License as published by the Free
006     * Software Foundation; either version 2.1 of the License, or (at your option)
007     * any later version.
008     *
009     * This library is distributed in the hope that it will be useful, but WITHOUT
010     * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
011     * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
012     * details.
013     */
014    
015    package com.liferay.portal.util;
016    
017    import com.liferay.portal.kernel.security.pacl.DoPrivileged;
018    import com.liferay.portal.kernel.util.CharPool;
019    import com.liferay.portal.kernel.util.Html;
020    import com.liferay.portal.kernel.util.HttpUtil;
021    import com.liferay.portal.kernel.util.StringBundler;
022    import com.liferay.portal.kernel.util.StringPool;
023    import com.liferay.portal.kernel.util.StringUtil;
024    import com.liferay.portal.kernel.util.Validator;
025    
026    import java.util.regex.Matcher;
027    import java.util.regex.Pattern;
028    
029    import net.htmlparser.jericho.Renderer;
030    import net.htmlparser.jericho.Source;
031    import net.htmlparser.jericho.TextExtractor;
032    
033    /**
034     * @author Brian Wing Shun Chan
035     * @author Clarence Shen
036     * @author Harry Mark
037     * @author Samuel Kong
038     * @author Connor McKay
039     * @author Shuyang Zhou
040     */
041    @DoPrivileged
042    public class HtmlImpl implements Html {
043    
044            public static final int ESCAPE_MODE_ATTRIBUTE = 1;
045    
046            public static final int ESCAPE_MODE_CSS = 2;
047    
048            public static final int ESCAPE_MODE_JS = 3;
049    
050            public static final int ESCAPE_MODE_TEXT = 4;
051    
052            public static final int ESCAPE_MODE_URL = 5;
053    
054            @Override
055            public String escape(String text) {
056                    if (text == null) {
057                            return null;
058                    }
059    
060                    if (text.length() == 0) {
061                            return StringPool.BLANK;
062                    }
063    
064                    // Escape using XSS recommendations from
065                    // http://www.owasp.org/index.php/Cross_Site_Scripting
066                    // #How_to_Protect_Yourself
067    
068                    StringBundler sb = null;
069    
070                    int lastReplacementIndex = 0;
071    
072                    for (int i = 0; i < text.length(); i++) {
073                            char c = text.charAt(i);
074    
075                            String replacement = null;
076    
077                            switch (c) {
078                                    case '<':
079                                            replacement = "&lt;";
080    
081                                            break;
082    
083                                    case '>':
084                                            replacement = "&gt;";
085    
086                                            break;
087    
088                                    case '&':
089                                            replacement = "&amp;";
090    
091                                            break;
092    
093                                    case '"':
094                                            replacement = "&#034;";
095    
096                                            break;
097    
098                                    case '\'':
099                                            replacement = "&#039;";
100    
101                                            break;
102    
103                                    case '\u00bb': // '???'
104                                            replacement = "&#187;";
105    
106                                            break;
107    
108                                    case '\u2013':
109                                            replacement = "&#x2013;";
110    
111                                            break;
112    
113                                    case '\u2014':
114                                            replacement = "&#x2014;";
115    
116                                            break;
117                            }
118    
119                            if (replacement != null) {
120                                    if (sb == null) {
121                                            sb = new StringBundler();
122                                    }
123    
124                                    if (i > lastReplacementIndex) {
125                                            sb.append(text.substring(lastReplacementIndex, i));
126                                    }
127    
128                                    sb.append(replacement);
129    
130                                    lastReplacementIndex = i + 1;
131                            }
132                    }
133    
134                    if (sb == null) {
135                            return text;
136                    }
137                    else {
138                            if (lastReplacementIndex < text.length()) {
139                                    sb.append(text.substring(lastReplacementIndex));
140                            }
141    
142                            return sb.toString();
143                    }
144            }
145    
146            @Override
147            public String escape(String text, int type) {
148                    if (text == null) {
149                            return null;
150                    }
151    
152                    if (text.length() == 0) {
153                            return StringPool.BLANK;
154                    }
155    
156                    String prefix = StringPool.BLANK;
157                    String postfix = StringPool.BLANK;
158    
159                    if (type == ESCAPE_MODE_ATTRIBUTE) {
160                            prefix = "&#x";
161                            postfix = StringPool.SEMICOLON;
162                    }
163                    else if (type == ESCAPE_MODE_CSS) {
164                            prefix = StringPool.BACK_SLASH;
165                    }
166                    else if (type == ESCAPE_MODE_JS) {
167                            prefix = "\\x";
168                    }
169                    else if (type == ESCAPE_MODE_URL) {
170                            return HttpUtil.encodeURL(text, true);
171                    }
172                    else {
173                            return escape(text);
174                    }
175    
176                    StringBuilder sb = new StringBuilder();
177    
178                    for (int i = 0; i < text.length(); i++) {
179                            char c = text.charAt(i);
180    
181                            if ((c > 255) || Character.isLetterOrDigit(c) ||
182                                    (c == CharPool.DASH) || (c == CharPool.UNDERLINE)) {
183    
184                                    sb.append(c);
185                            }
186                            else {
187                                    sb.append(prefix);
188    
189                                    String hexString = StringUtil.toHexString(c);
190    
191                                    if (hexString.length() == 1) {
192                                            sb.append(StringPool.ASCII_TABLE[48]);
193                                    }
194    
195                                    sb.append(hexString);
196                                    sb.append(postfix);
197                            }
198                    }
199    
200                    if (sb.length() == text.length()) {
201                            return text;
202                    }
203                    else {
204                            return sb.toString();
205                    }
206            }
207    
208            @Override
209            public String escapeAttribute(String attribute) {
210                    return escape(attribute, ESCAPE_MODE_ATTRIBUTE);
211            }
212    
213            @Override
214            public String escapeCSS(String css) {
215                    return escape(css, ESCAPE_MODE_CSS);
216            }
217    
218            @Override
219            public String escapeHREF(String href) {
220                    if (href == null) {
221                            return null;
222                    }
223    
224                    if (href.length() == 0) {
225                            return StringPool.BLANK;
226                    }
227    
228                    if (href.indexOf(StringPool.COLON) == 10) {
229                            String protocol = href.substring(0, 10).toLowerCase();
230    
231                            if (protocol.equals("javascript")) {
232                                    href = StringUtil.replaceFirst(href, StringPool.COLON, "%3a");
233                            }
234                    }
235    
236                    return escapeAttribute(href);
237            }
238    
239            @Override
240            public String escapeJS(String js) {
241                    return escape(js, ESCAPE_MODE_JS);
242            }
243    
244            @Override
245            public String escapeURL(String url) {
246                    return escape(url, ESCAPE_MODE_URL);
247            }
248    
249            @Override
250            public String escapeXPath(String xPath) {
251                    if (Validator.isNull(xPath)) {
252                            return xPath;
253                    }
254    
255                    StringBuilder sb = new StringBuilder(xPath.length());
256    
257                    for (int i = 0; i < xPath.length(); i++) {
258                            char c = xPath.charAt(i);
259    
260                            boolean hasToken = false;
261    
262                            for (int j = 0; j < _XPATH_TOKENS.length; j++) {
263                                    if (c == _XPATH_TOKENS[j]) {
264                                            hasToken = true;
265    
266                                            break;
267                                    }
268                            }
269    
270                            if (hasToken) {
271                                    sb.append(StringPool.UNDERLINE);
272                            }
273                            else {
274                                    sb.append(c);
275                            }
276                    }
277    
278                    return sb.toString();
279            }
280    
281            @Override
282            public String escapeXPathAttribute(String xPathAttribute) {
283                    boolean hasApostrophe = xPathAttribute.contains(StringPool.APOSTROPHE);
284                    boolean hasQuote = xPathAttribute.contains(StringPool.QUOTE);
285    
286                    if (hasQuote && hasApostrophe) {
287                            String[] parts = xPathAttribute.split(StringPool.APOSTROPHE);
288    
289                            return "concat('".concat(
290                                    StringUtil.merge(parts, "', \"'\", '")).concat("')");
291                    }
292    
293                    if (hasQuote) {
294                            return StringPool.APOSTROPHE.concat(xPathAttribute).concat(
295                                    StringPool.APOSTROPHE);
296                    }
297    
298                    return StringPool.QUOTE.concat(xPathAttribute).concat(StringPool.QUOTE);
299            }
300    
301            @Override
302            public String extractText(String html) {
303                    if (html == null) {
304                            return null;
305                    }
306    
307                    Source source = new Source(html);
308    
309                    TextExtractor textExtractor = source.getTextExtractor();
310    
311                    return textExtractor.toString();
312            }
313    
314            @Override
315            public String fromInputSafe(String text) {
316                    return StringUtil.replace(text, "&amp;", "&");
317            }
318    
319            @Override
320            public String render(String html) {
321                    if (html == null) {
322                            return null;
323                    }
324    
325                    Source source = new Source(html);
326    
327                    Renderer renderer = source.getRenderer();
328    
329                    return renderer.toString();
330            }
331    
332            @Override
333            public String replaceMsWordCharacters(String text) {
334                    return StringUtil.replace(text, _MS_WORD_UNICODE, _MS_WORD_HTML);
335            }
336    
337            @Override
338            public String replaceNewLine(String text) {
339                    if (text == null) {
340                            return null;
341                    }
342    
343                    return text.replaceAll("\r?\n", "<br />");
344            }
345    
346            @Override
347            public String stripBetween(String text, String tag) {
348                    return StringUtil.stripBetween(text, "<" + tag, "</" + tag + ">");
349            }
350    
351            @Override
352            public String stripComments(String text) {
353                    return StringUtil.stripBetween(text, "<!--", "-->");
354            }
355    
356            @Override
357            public String stripHtml(String text) {
358                    if (text == null) {
359                            return null;
360                    }
361    
362                    text = stripComments(text);
363    
364                    StringBuilder sb = new StringBuilder(text.length());
365    
366                    int x = 0;
367                    int y = text.indexOf("<");
368    
369                    while (y != -1) {
370                            sb.append(text.substring(x, y));
371                            sb.append(StringPool.SPACE);
372    
373                            // Look for text enclosed by <abc></abc>
374    
375                            if (isTag(_TAG_SCRIPT, text, y + 1)) {
376                                    y = stripTag(_TAG_SCRIPT, text, y);
377                            }
378                            else if (isTag(_TAG_STYLE, text, y + 1)) {
379                                    y = stripTag(_TAG_STYLE, text, y);
380                            }
381    
382                            x = text.indexOf(">", y);
383    
384                            if (x == -1) {
385                                    break;
386                            }
387    
388                            x++;
389    
390                            if (x < y) {
391    
392                                    // <b>Hello</b
393    
394                                    break;
395                            }
396    
397                            y = text.indexOf("<", x);
398                    }
399    
400                    if (y == -1) {
401                            sb.append(text.substring(x));
402                    }
403    
404                    return sb.toString();
405            }
406    
407            @Override
408            public String toInputSafe(String text) {
409                    return StringUtil.replace(
410                            text,
411                            new String[] {"&", "\""},
412                            new String[] {"&amp;", "&quot;"});
413            }
414    
415            @Override
416            public String unescape(String text) {
417                    if (text == null) {
418                            return null;
419                    }
420    
421                    if (text.length() == 0) {
422                            return StringPool.BLANK;
423                    }
424    
425                    // Optimize this
426    
427                    text = StringUtil.replace(text, "&lt;", "<");
428                    text = StringUtil.replace(text, "&gt;", ">");
429                    text = StringUtil.replace(text, "&amp;", "&");
430                    text = StringUtil.replace(text, "&#034;", "\"");
431                    text = StringUtil.replace(text, "&#039;", "'");
432                    text = StringUtil.replace(text, "&#040;", "(");
433                    text = StringUtil.replace(text, "&#041;", ")");
434                    text = StringUtil.replace(text, "&#044;", ",");
435                    text = StringUtil.replace(text, "&#035;", "#");
436                    text = StringUtil.replace(text, "&#037;", "%");
437                    text = StringUtil.replace(text, "&#059;", ";");
438                    text = StringUtil.replace(text, "&#061;", "=");
439                    text = StringUtil.replace(text, "&#043;", "+");
440                    text = StringUtil.replace(text, "&#045;", "-");
441    
442                    return text;
443            }
444    
445            @Override
446            public String unescapeCDATA(String text) {
447                    if (text == null) {
448                            return null;
449                    }
450    
451                    if (text.length() == 0) {
452                            return StringPool.BLANK;
453                    }
454    
455                    text = StringUtil.replace(text, "&lt;![CDATA[", "<![CDATA[");
456                    text = StringUtil.replace(text, "]]&gt;", "]]>");
457    
458                    return text;
459            }
460    
461            @Override
462            public String wordBreak(String text, int columns) {
463                    StringBundler sb = new StringBundler();
464    
465                    int length = 0;
466                    int lastWrite = 0;
467                    int pos = 0;
468    
469                    Pattern pattern = Pattern.compile("([\\s<&]|$)");
470    
471                    Matcher matcher = pattern.matcher(text);
472    
473                    while (matcher.find()) {
474                            if (matcher.start() < pos) {
475                                    continue;
476                            }
477    
478                            while ((length + matcher.start() - pos) >= columns) {
479                                    pos += columns - length;
480    
481                                    sb.append(text.substring(lastWrite, pos));
482                                    sb.append("<wbr/>&shy;");
483    
484                                    length = 0;
485                                    lastWrite = pos;
486                            }
487    
488                            length += matcher.start() - pos;
489    
490                            String group = matcher.group();
491    
492                            if (group.equals(StringPool.AMPERSAND)) {
493                                    int x = text.indexOf(StringPool.SEMICOLON, matcher.start());
494    
495                                    if (x != -1) {
496                                            length++;
497                                            pos = x + 1;
498                                    }
499    
500                                    continue;
501                            }
502    
503                            if (group.equals(StringPool.LESS_THAN)) {
504                                    int x = text.indexOf(StringPool.GREATER_THAN, matcher.start());
505    
506                                    if (x != -1) {
507                                            pos = x + 1;
508                                    }
509    
510                                    continue;
511                            }
512    
513                            if (group.equals(StringPool.SPACE) ||
514                                    group.equals(StringPool.NEW_LINE)) {
515    
516                                    length = 0;
517                                    pos = matcher.start() + 1;
518                            }
519                    }
520    
521                    sb.append(text.substring(lastWrite));
522    
523                    return sb.toString();
524            }
525    
526            protected boolean isTag(char[] tag, String text, int pos) {
527                    if ((pos + tag.length + 1) <= text.length()) {
528                            char item;
529    
530                            for (int i = 0; i < tag.length; i++) {
531                                    item = text.charAt(pos++);
532    
533                                    if (Character.toLowerCase(item) != tag[i]) {
534                                            return false;
535                                    }
536                            }
537    
538                            item = text.charAt(pos);
539    
540                            // Check that char after tag is not a letter (i.e. another tag)
541    
542                            return !Character.isLetter(item);
543                    }
544                    else {
545                            return false;
546                    }
547            }
548    
549            protected int stripTag(char[] tag, String text, int pos) {
550                    int x = pos + _TAG_SCRIPT.length;
551    
552                    // Find end of the tag
553    
554                    x = text.indexOf(">", x);
555    
556                    if (x < 0) {
557                            return pos;
558                    }
559    
560                    // Check if preceding character is / (i.e. is this instance of <abc/>)
561    
562                    if (text.charAt(x-1) == '/') {
563                            return pos;
564                    }
565    
566                    // Search for the ending </abc> tag
567    
568                    for (;;) {
569                            x = text.indexOf("</", x);
570    
571                            if (x >= 0) {
572                                    if (isTag(tag, text, x + 2)) {
573                                            pos = x;
574    
575                                            break;
576                                    }
577                                    else {
578    
579                                            // Skip past "</"
580    
581                                            x += 2;
582                                    }
583                            }
584                            else {
585                                    break;
586                            }
587                    }
588    
589                    return pos;
590            }
591    
592            private static final String[] _MS_WORD_HTML = new String[] {
593                    "&reg;", StringPool.APOSTROPHE, StringPool.QUOTE, StringPool.QUOTE
594            };
595    
596            private static final String[] _MS_WORD_UNICODE = new String[] {
597                    "\u00ae", "\u2019", "\u201c", "\u201d"
598            };
599    
600            private static final char[] _TAG_SCRIPT = {'s', 'c', 'r', 'i', 'p', 't'};
601    
602            private static final char[] _TAG_STYLE = {'s', 't', 'y', 'l', 'e'};
603    
604            // See http://www.w3.org/TR/xpath20/#lexical-structure
605    
606            private static final char[] _XPATH_TOKENS = {
607                    '(', ')', '[', ']', '.', '@', ',', ':', '/', '|', '+', '-', '=', '!',
608                    '<', '>', '*', '$', '"', '"', ' ', 9, 10, 13, 133, 8232};
609    
610    }