001
014
015 package com.liferay.portal.util;
016
017 import com.liferay.portal.kernel.security.pacl.DoPrivileged;
018 import com.liferay.portal.kernel.util.CharPool;
019 import com.liferay.portal.kernel.util.Html;
020 import com.liferay.portal.kernel.util.HttpUtil;
021 import com.liferay.portal.kernel.util.StringBundler;
022 import com.liferay.portal.kernel.util.StringPool;
023 import com.liferay.portal.kernel.util.StringUtil;
024 import com.liferay.portal.kernel.util.Validator;
025
026 import java.util.regex.Matcher;
027 import java.util.regex.Pattern;
028
029 import net.htmlparser.jericho.Renderer;
030 import net.htmlparser.jericho.Source;
031 import net.htmlparser.jericho.TextExtractor;
032
033
041 @DoPrivileged
042 public class HtmlImpl implements Html {
043
/**
 * Mode for {@link #escape(String, int)}: escaped characters are written as
 * <code>&amp;#x</code> + hex code + <code>;</code>.
 */
044 public static final int ESCAPE_MODE_ATTRIBUTE = 1;
045
/**
 * Mode for {@link #escape(String, int)}: escaped characters are written as
 * a backslash followed by the hex code.
 */
046 public static final int ESCAPE_MODE_CSS = 2;
047
/**
 * Mode for {@link #escape(String, int)}: escaped characters are written as
 * a backslash, the letter x and the hex code.
 */
048 public static final int ESCAPE_MODE_JS = 3;
049
/**
 * Mode for {@link #escape(String, int)}: not matched by any explicit branch
 * there, so the call falls through to {@link #escape(String)}.
 */
050 public static final int ESCAPE_MODE_TEXT = 4;
051
/**
 * Mode for {@link #escape(String, int)}: the text is passed to
 * <code>HttpUtil.encodeURL(text, true)</code>.
 */
052 public static final int ESCAPE_MODE_URL = 5;
053
054
061 @Override
062 public String escape(String text) {
063 if (text == null) {
064 return null;
065 }
066
067 if (text.length() == 0) {
068 return StringPool.BLANK;
069 }
070
071
072
073
074
075 StringBundler sb = null;
076
077 int lastReplacementIndex = 0;
078
079 for (int i = 0; i < text.length(); i++) {
080 char c = text.charAt(i);
081
082 String replacement = null;
083
084 if (c == '<') {
085 replacement = "<";
086 }
087 else if (c == '>') {
088 replacement = ">";
089 }
090 else if (c == '&') {
091 replacement = "&";
092 }
093 else if (c == '"') {
094 replacement = """;
095 }
096 else if (c == '\'') {
097 replacement = "'";
098 }
099 else if (c == '\u00bb') {
100 replacement = "»";
101 }
102 else if (c == '\u2013') {
103 replacement = "–";
104 }
105 else if (c == '\u2014') {
106 replacement = "—";
107 }
108 else if (c == '\u2028') {
109 replacement = "舲";
110 }
111 else if (!_isValidXmlCharacter(c) ||
112 _isUnicodeCompatibilityCharacter(c)) {
113
114 replacement = StringPool.SPACE;
115 }
116
117 if (replacement != null) {
118 if (sb == null) {
119 sb = new StringBundler();
120 }
121
122 if (i > lastReplacementIndex) {
123 sb.append(text.substring(lastReplacementIndex, i));
124 }
125
126 sb.append(replacement);
127
128 lastReplacementIndex = i + 1;
129 }
130 }
131
132 if (sb == null) {
133 return text;
134 }
135
136 if (lastReplacementIndex < text.length()) {
137 sb.append(text.substring(lastReplacementIndex));
138 }
139
140 return sb.toString();
141 }
142
143 @Override
144 public String escape(String text, int type) {
145 if (text == null) {
146 return null;
147 }
148
149 if (text.length() == 0) {
150 return StringPool.BLANK;
151 }
152
153 String prefix = StringPool.BLANK;
154 String postfix = StringPool.BLANK;
155
156 if (type == ESCAPE_MODE_ATTRIBUTE) {
157 prefix = "&#x";
158 postfix = StringPool.SEMICOLON;
159 }
160 else if (type == ESCAPE_MODE_CSS) {
161 prefix = StringPool.BACK_SLASH;
162 }
163 else if (type == ESCAPE_MODE_JS) {
164 prefix = "\\x";
165 }
166 else if (type == ESCAPE_MODE_URL) {
167 return HttpUtil.encodeURL(text, true);
168 }
169 else {
170 return escape(text);
171 }
172
173 StringBuilder sb = new StringBuilder();
174
175 for (int i = 0; i < text.length(); i++) {
176 char c = text.charAt(i);
177
178 if ((type == ESCAPE_MODE_ATTRIBUTE) &&
179 (!_isValidXmlCharacter(c) ||
180 _isUnicodeCompatibilityCharacter(c))) {
181
182 sb.append(StringPool.SPACE);
183 }
184 else if ((c > 255) || Character.isLetterOrDigit(c) ||
185 (c == CharPool.DASH) || (c == CharPool.UNDERLINE)) {
186
187 sb.append(c);
188 }
189 else {
190 sb.append(prefix);
191
192 String hexString = StringUtil.toHexString(c);
193
194 if (hexString.length() == 1) {
195 sb.append(StringPool.ASCII_TABLE[48]);
196 }
197
198 sb.append(hexString);
199 sb.append(postfix);
200 }
201 }
202
203 if ((type != ESCAPE_MODE_ATTRIBUTE) && (sb.length() == text.length())) {
204 return text;
205 }
206 else {
207 return sb.toString();
208 }
209 }
210
/**
 * Escapes the value for use inside an HTML attribute by delegating to
 * {@link #escape(String, int)} with {@link #ESCAPE_MODE_ATTRIBUTE}.
 *
 * @param  attribute the attribute value to escape
 * @return the escaped value, or <code>null</code> if the value is
 *         <code>null</code>
 */
211 @Override
212 public String escapeAttribute(String attribute) {
213 return escape(attribute, ESCAPE_MODE_ATTRIBUTE);
214 }
215
/**
 * Escapes the value for use inside CSS by delegating to
 * {@link #escape(String, int)} with {@link #ESCAPE_MODE_CSS}.
 *
 * @param  css the CSS value to escape
 * @return the escaped value, or <code>null</code> if the value is
 *         <code>null</code>
 */
216 @Override
217 public String escapeCSS(String css) {
218 return escape(css, ESCAPE_MODE_CSS);
219 }
220
221 @Override
222 public String escapeHREF(String href) {
223 if (href == null) {
224 return null;
225 }
226
227 if (href.length() == 0) {
228 return StringPool.BLANK;
229 }
230
231 int index = href.indexOf(StringPool.COLON);
232
233 if (index == 4) {
234 String protocol = StringUtil.toLowerCase(href.substring(0, 4));
235
236 if (protocol.equals("data")) {
237 href = StringUtil.replaceFirst(href, StringPool.COLON, "%3a");
238 }
239 }
240 else if (index == 10) {
241 String protocol = StringUtil.toLowerCase(href.substring(0, 10));
242
243 if (protocol.equals("javascript")) {
244 href = StringUtil.replaceFirst(href, StringPool.COLON, "%3a");
245 }
246 }
247
248 return escapeAttribute(href);
249 }
250
/**
 * Escapes the value for use inside JavaScript by delegating to
 * {@link #escape(String, int)} with {@link #ESCAPE_MODE_JS}.
 *
 * @param  js the JavaScript value to escape
 * @return the escaped value, or <code>null</code> if the value is
 *         <code>null</code>
 */
251 @Override
252 public String escapeJS(String js) {
253 return escape(js, ESCAPE_MODE_JS);
254 }
255
/**
 * Escapes the value for use in a URL by delegating to
 * {@link #escape(String, int)} with {@link #ESCAPE_MODE_URL}, which in turn
 * calls <code>HttpUtil.encodeURL(text, true)</code> for non-empty input.
 *
 * @param  url the URL value to escape
 * @return the escaped value, or <code>null</code> if the value is
 *         <code>null</code>
 */
256 @Override
257 public String escapeURL(String url) {
258 return escape(url, ESCAPE_MODE_URL);
259 }
260
261 @Override
262 public String escapeXPath(String xPath) {
263 if (Validator.isNull(xPath)) {
264 return xPath;
265 }
266
267 StringBuilder sb = new StringBuilder(xPath.length());
268
269 for (int i = 0; i < xPath.length(); i++) {
270 char c = xPath.charAt(i);
271
272 boolean hasToken = false;
273
274 for (int j = 0; j < _XPATH_TOKENS.length; j++) {
275 if (c == _XPATH_TOKENS[j]) {
276 hasToken = true;
277
278 break;
279 }
280 }
281
282 if (hasToken) {
283 sb.append(StringPool.UNDERLINE);
284 }
285 else {
286 sb.append(c);
287 }
288 }
289
290 return sb.toString();
291 }
292
293 @Override
294 public String escapeXPathAttribute(String xPathAttribute) {
295 boolean hasApostrophe = xPathAttribute.contains(StringPool.APOSTROPHE);
296 boolean hasQuote = xPathAttribute.contains(StringPool.QUOTE);
297
298 if (hasQuote && hasApostrophe) {
299 String[] parts = xPathAttribute.split(StringPool.APOSTROPHE);
300
301 return "concat('".concat(
302 StringUtil.merge(parts, "', \"'\", '")).concat("')");
303 }
304
305 if (hasQuote) {
306 return StringPool.APOSTROPHE.concat(xPathAttribute).concat(
307 StringPool.APOSTROPHE);
308 }
309
310 return StringPool.QUOTE.concat(xPathAttribute).concat(StringPool.QUOTE);
311 }
312
313 @Override
314 public String extractText(String html) {
315 if (html == null) {
316 return null;
317 }
318
319 Source source = new Source(html);
320
321 TextExtractor textExtractor = source.getTextExtractor();
322
323 return textExtractor.toString();
324 }
325
326 @Override
327 public String fromInputSafe(String text) {
328 return StringUtil.replace(text, "&", "&");
329 }
330
331 @Override
332 public String getAUICompatibleId(String text) {
333 if (Validator.isNull(text)) {
334 return text;
335 }
336
337 StringBundler sb = null;
338
339 int lastReplacementIndex = 0;
340
341 for (int i = 0; i < text.length(); i++) {
342 char c = text.charAt(i);
343
344 if (((c <= 127) && (Validator.isChar(c) || Validator.isDigit(c))) ||
345 ((c > 127) && (c != CharPool.FIGURE_SPACE) &&
346 (c != CharPool.NARROW_NO_BREAK_SPACE) &&
347 (c != CharPool.NO_BREAK_SPACE))) {
348
349 continue;
350 }
351
352 if (sb == null) {
353 sb = new StringBundler();
354 }
355
356 if (i > lastReplacementIndex) {
357 sb.append(text.substring(lastReplacementIndex, i));
358 }
359
360 sb.append(CharPool.UNDERLINE);
361
362 if (c != CharPool.UNDERLINE) {
363 sb.append(StringUtil.toHexString(c));
364 }
365
366 sb.append(CharPool.UNDERLINE);
367
368 lastReplacementIndex = i + 1;
369 }
370
371 if (sb == null) {
372 return text;
373 }
374
375 if (lastReplacementIndex < text.length()) {
376 sb.append(text.substring(lastReplacementIndex));
377 }
378
379 return sb.toString();
380 }
381
382 @Deprecated
383 @Override
384 public String render(String html) {
385 if (html == null) {
386 return null;
387 }
388
389 Source source = new Source(html);
390
391 Renderer renderer = source.getRenderer();
392
393 return renderer.toString();
394 }
395
/**
 * Replaces the characters listed in <code>_MS_WORD_UNICODE</code> (the
 * registered sign, the right single quotation mark and the left and right
 * double quotation marks) with the entries of <code>_MS_WORD_HTML</code> by
 * calling <code>StringUtil.replace</code> with both arrays.
 *
 * @param  text the text to process
 * @return the result of <code>StringUtil.replace</code>
 */
396 @Override
397 public String replaceMsWordCharacters(String text) {
398 return StringUtil.replace(text, _MS_WORD_UNICODE, _MS_WORD_HTML);
399 }
400
401 @Override
402 public String replaceNewLine(String text) {
403 if (text == null) {
404 return null;
405 }
406
407 return text.replaceAll("\r?\n", "<br />");
408 }
409
/**
 * Delegates to <code>StringUtil.stripBetween</code> with the opening marker
 * built from a less-than sign plus the tag name and the closing marker
 * built from the complete end tag.
 *
 * <p>
 * NOTE(review): presumably this removes each element with the given tag
 * name together with its content; confirm against
 * <code>StringUtil.stripBetween</code>.
 * </p>
 *
 * @param  text the text to process
 * @param  tag the tag name, without angle brackets
 * @return the result of <code>StringUtil.stripBetween</code>
 */
410 @Override
411 public String stripBetween(String text, String tag) {
412 return StringUtil.stripBetween(text, "<" + tag, "</" + tag + ">");
413 }
414
/**
 * Delegates to <code>StringUtil.stripBetween</code> with the HTML comment
 * opening and closing markers.
 *
 * <p>
 * NOTE(review): presumably this removes every HTML comment; confirm against
 * <code>StringUtil.stripBetween</code>.
 * </p>
 *
 * @param  text the text to process
 * @return the result of <code>StringUtil.stripBetween</code>
 */
415 @Override
416 public String stripComments(String text) {
417 return StringUtil.stripBetween(text, "<!--", "-->");
418 }
419
420 @Override
421 public String stripHtml(String text) {
422 if (text == null) {
423 return null;
424 }
425
426 text = stripComments(text);
427
428 StringBuilder sb = new StringBuilder(text.length());
429
430 int x = 0;
431 int y = text.indexOf("<");
432
433 while (y != -1) {
434 sb.append(text.substring(x, y));
435 sb.append(StringPool.SPACE);
436
437
438
439 if (isTag(_TAG_SCRIPT, text, y + 1)) {
440 y = stripTag(_TAG_SCRIPT, text, y);
441 }
442 else if (isTag(_TAG_STYLE, text, y + 1)) {
443 y = stripTag(_TAG_STYLE, text, y);
444 }
445
446 x = text.indexOf(">", y);
447
448 if (x == -1) {
449 break;
450 }
451
452 x++;
453
454 if (x < y) {
455
456
457
458 break;
459 }
460
461 y = text.indexOf("<", x);
462 }
463
464 if (y == -1) {
465 sb.append(text.substring(x));
466 }
467
468 return sb.toString();
469 }
470
471 @Override
472 public String toInputSafe(String text) {
473 return StringUtil.replace(
474 text,
475 new String[] {"&", "\""},
476 new String[] {"&", """});
477 }
478
479 @Override
480 public String unescape(String text) {
481 if (text == null) {
482 return null;
483 }
484
485 if (text.length() == 0) {
486 return StringPool.BLANK;
487 }
488
489
490
491 text = StringUtil.replace(text, "<", "<");
492 text = StringUtil.replace(text, ">", ">");
493 text = StringUtil.replace(text, "&", "&");
494 text = StringUtil.replace(text, "’", "\u2019");
495 text = StringUtil.replace(text, """, "\"");
496 text = StringUtil.replace(text, "'", "'");
497 text = StringUtil.replace(text, "(", "(");
498 text = StringUtil.replace(text, ")", ")");
499 text = StringUtil.replace(text, ",", ",");
500 text = StringUtil.replace(text, "#", "#");
501 text = StringUtil.replace(text, "%", "%");
502 text = StringUtil.replace(text, ";", ";");
503 text = StringUtil.replace(text, "=", "=");
504 text = StringUtil.replace(text, "+", "+");
505 text = StringUtil.replace(text, "-", "-");
506
507 return text;
508 }
509
510 @Override
511 public String unescapeCDATA(String text) {
512 if (text == null) {
513 return null;
514 }
515
516 if (text.length() == 0) {
517 return StringPool.BLANK;
518 }
519
520 text = StringUtil.replace(text, "<![CDATA[", "<![CDATA[");
521 text = StringUtil.replace(text, "]]>", "]]>");
522
523 return text;
524 }
525
526 @Override
527 public String wordBreak(String text, int columns) {
528 StringBundler sb = new StringBundler();
529
530 int length = 0;
531 int lastWrite = 0;
532 int pos = 0;
533
534 Pattern pattern = Pattern.compile("([\\s<&]|$)");
535
536 Matcher matcher = pattern.matcher(text);
537
538 while (matcher.find()) {
539 if (matcher.start() < pos) {
540 continue;
541 }
542
543 while ((length + matcher.start() - pos) >= columns) {
544 pos += columns - length;
545
546 sb.append(text.substring(lastWrite, pos));
547 sb.append("<wbr/>­");
548
549 length = 0;
550 lastWrite = pos;
551 }
552
553 length += matcher.start() - pos;
554
555 String group = matcher.group();
556
557 if (group.equals(StringPool.AMPERSAND)) {
558 int x = text.indexOf(StringPool.SEMICOLON, matcher.start());
559
560 if (x != -1) {
561 length++;
562 pos = x + 1;
563 }
564
565 continue;
566 }
567
568 if (group.equals(StringPool.LESS_THAN)) {
569 int x = text.indexOf(StringPool.GREATER_THAN, matcher.start());
570
571 if (x != -1) {
572 pos = x + 1;
573 }
574
575 continue;
576 }
577
578 if (group.equals(StringPool.SPACE) ||
579 group.equals(StringPool.NEW_LINE)) {
580
581 length = 0;
582 pos = matcher.start() + 1;
583 }
584 }
585
586 sb.append(text.substring(lastWrite));
587
588 return sb.toString();
589 }
590
591 protected boolean isTag(char[] tag, String text, int pos) {
592 if ((pos + tag.length + 1) <= text.length()) {
593 char item;
594
595 for (int i = 0; i < tag.length; i++) {
596 item = text.charAt(pos++);
597
598 if (Character.toLowerCase(item) != tag[i]) {
599 return false;
600 }
601 }
602
603 item = text.charAt(pos);
604
605
606
607 return !Character.isLetter(item);
608 }
609 else {
610 return false;
611 }
612 }
613
614 protected int stripTag(char[] tag, String text, int pos) {
615 int x = pos + _TAG_SCRIPT.length;
616
617
618
619 x = text.indexOf(">", x);
620
621 if (x < 0) {
622 return pos;
623 }
624
625
626
627 if (text.charAt(x-1) == '/') {
628 return pos;
629 }
630
631
632
633 while (true) {
634 x = text.indexOf("</", x);
635
636 if (x >= 0) {
637 if (isTag(tag, text, x + 2)) {
638 pos = x;
639
640 break;
641 }
642 else {
643
644
645
646 x += 2;
647 }
648 }
649 else {
650 break;
651 }
652 }
653
654 return pos;
655 }
656
657 private boolean _isUnicodeCompatibilityCharacter(char c) {
658 if (((c >= '\u007f') && (c <= '\u0084')) ||
659 ((c >= '\u0086') && (c <= '\u009f')) ||
660 ((c >= '\ufdd0') && (c <= '\ufdef'))) {
661
662 return true;
663 }
664
665 return false;
666 }
667
668 private boolean _isValidXmlCharacter(char c) {
669 if ((c == '\u0009') || (c == CharPool.NEW_LINE) ||
670 (c == CharPool.RETURN) || ((c >= '\u0020') && (c <= '\ud7ff')) ||
671 ((c >= '\ue000') && (c <= '\ufffd')) ||
672 Character.isLowSurrogate(c) || Character.isHighSurrogate(c)) {
673
674 return true;
675 }
676
677 return false;
678 }
679
680 private static final String[] _MS_WORD_HTML = new String[] {
681 "®", StringPool.APOSTROPHE, StringPool.QUOTE, StringPool.QUOTE
682 };
683
684 private static final String[] _MS_WORD_UNICODE = new String[] {
685 "\u00ae", "\u2019", "\u201c", "\u201d"
686 };
687
/**
 * Lower case name of the script element, passed to <code>isTag</code> and
 * <code>stripTag</code> by <code>stripHtml</code>.
 */
688 private static final char[] _TAG_SCRIPT = {'s', 'c', 'r', 'i', 'p', 't'};
689
/**
 * Lower case name of the style element, passed to <code>isTag</code> and
 * <code>stripTag</code> by <code>stripHtml</code>.
 */
690 private static final char[] _TAG_STYLE = {'s', 't', 'y', 'l', 'e'};
691
692
693
/**
 * Characters that <code>escapeXPath</code> replaces with an underscore. The
 * numeric entries are the tab (9), the line feed (10), the carriage return
 * (13), the next line control character (133, U+0085) and the line
 * separator (8232, U+2028).
 *
 * <p>
 * NOTE(review): the double quote is listed twice and the apostrophe is not
 * listed at all; possibly the second entry was meant to be the apostrophe.
 * Confirm before changing, because it alters what
 * <code>escapeXPath</code> returns.
 * </p>
 */
694 private static final char[] _XPATH_TOKENS = {
695 '(', ')', '[', ']', '.', '@', ',', ':', '/', '|', '+', '-', '=', '!',
696 '<', '>', '*', '$', '"', '"', ' ', 9, 10, 13, 133, 8232};
697
698 }