001 /** 002 * Copyright (c) 2000-present Liferay, Inc. All rights reserved. 003 * 004 * This library is free software; you can redistribute it and/or modify it under 005 * the terms of the GNU Lesser General Public License as published by the Free 006 * Software Foundation; either version 2.1 of the License, or (at your option) 007 * any later version. 008 * 009 * This library is distributed in the hope that it will be useful, but WITHOUT 010 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 011 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 012 * details. 013 */ 014 015 package com.liferay.portlet.wiki.translators; 016 017 import com.liferay.portal.kernel.util.CharPool; 018 import com.liferay.portal.kernel.util.StringPool; 019 import com.liferay.portal.kernel.util.StringUtil; 020 import com.liferay.portlet.wiki.importers.mediawiki.MediaWikiImporter; 021 022 import java.util.regex.Matcher; 023 import java.util.regex.Pattern; 024 025 /** 026 * @author Jorge Ferrer 027 * @author Daniel Kocsis 028 */ 029 public class MediaWikiToCreoleTranslator extends BaseTranslator { 030 031 public static final String TABLE_OF_CONTENTS = "<<TableOfContents>>\n\n"; 032 033 public MediaWikiToCreoleTranslator() { 034 initRegexps(); 035 initNowikiRegexps(); 036 } 037 038 public boolean isStrictImportMode() { 039 return _strictImportMode; 040 } 041 042 public void setStrictImportMode(boolean strictImportMode) { 043 _strictImportMode = strictImportMode; 044 } 045 046 protected void initNowikiRegexps() { 047 048 // Preformat protected 049 050 nowikiRegexps.add("(<nowiki>)(.*?)(</nowiki>)"); 051 nowikiRegexps.add("(<pre>)(.*?)(</pre>)"); 052 053 // Escape protected 054 055 nowikiRegexps.add( 056 "~(\\*\\*|~|//|-|#|\\{\\{|}}|\\\\|~\\[~~[|]]|----|=|\\|)"); 057 } 058 059 protected void initRegexps() { 060 061 // Clean unnecessary header emphasis 062 063 regexps.put("= '''([^=]+)''' =", "= $1 ="); 064 regexps.put("== '''([^=]+)''' ==", "== $1 =="); 065 regexps.put("== '''([^=]+)''' ===", "=== $1 ==="); 066 067 // Unscape angle brackets 068 069 regexps.put("<", "<"); 070 regexps.put(">", ">"); 071 072 // Remove categories 073 074 regexps.put("\\[\\[[Cc]ategory:([^\\]]*)\\]\\][\\n]*", ""); 075 076 // Remove disambiguations 077 078 regexps.put("\\{{2}OtherTopics\\|([^\\}]*)\\}{2}", StringPool.BLANK); 079 080 // Remove work in progress 081 082 regexps.put("\\{{2}Work in progress\\}{2}", StringPool.BLANK); 083 084 // Remove references 085 086 regexps.put("\\[{2}Wikipedia:([^\\]]*)\\]{2}", StringPool.BLANK); 087 088 // Bold and italics 089 090 regexps.put( 091 "''''((?s:.)*?)(''''|(\n\n|\r\r|\r\n\r\n))", "**//$1//**$3"); 092 093 // Bold 094 095 regexps.put("'''((?s:.)*?)('''|(\n\n|\r\r|\r\n\r\n))", "**$1**$3"); 096 097 // Italics 098 099 regexps.put("''((?s:.)*?)(''|(\n\n|\r\r|\r\n\r\n))", "//$1//$3"); 100 101 // Normalize URLs 102 103 regexps.put("\\[{2}((http|ftp)[^ ]*) ([^\\]]*)\\]{2}", "[$1 $3]"); 104 105 // URL 106 107 regexps.put("\\[((http|ftp)[^ ]*)\\]", "[[$1]]"); 108 109 // URL with label 110 111 regexps.put("\\[((http|ftp)[^ ]*) ([^\\]]*)\\]", "[[$1|$3]]"); 112 113 // Term and definition 114 115 regexps.put("^\\t([\\w]+):\\t(.*)", "**$1**:\n$2"); 116 117 // Indented paragraph 118 119 regexps.put("^\\t:\\t(.*)", "$1"); 120 121 // Monospace 122 123 regexps.put("(^ (.+))(\\n (.+))*", "{{{\n$0\n}}}"); 124 125 // No wiki 126 127 regexps.put("<nowiki>([^<]*)</nowiki>", "{{{$1}}}"); 128 129 // HTML PRE 130 131 regexps.put("<pre>([^<]*)</pre>", "{{{$1}}}"); 132 133 // User reference 134 135 regexps.put("[-]*\\[{2}User:([^\\]]*)\\]{2}", "$1"); 136 } 137 138 @Override 139 protected String postProcess(String content) { 140 if (_strictImportMode) { 141 content = runRegexp( 142 content, "\\{{2}Special:(.*?)\\}{2}", StringPool.BLANK); 143 content = runRegexp(content, "\\{{2}(.*?)\\}{2}", StringPool.BLANK); 144 content = runRegexp( 145 content, "(?s)\\{{2}(.*?)\\}{2}", StringPool.BLANK); 146 } 147 else { 148 content = runRegexp( 149 content, "\\{{2}Special:(.*?)\\}{2}", "{{{$1}}}\n"); 150 content = runRegexp(content, "\\{{2}(.*?)\\}{2}", "{{{$1}}}"); 151 content = runRegexp( 152 content, "([^\\{])(\\{{2})([^\\{])", "$1\n{{{\n$3"); 153 content = runRegexp( 154 content, "([^\\}])(\\}{2})([^\\}])", "$1\n}}}\n$3"); 155 } 156 157 // LEP-6118 158 159 Matcher matcher = _titlePattern.matcher(content); 160 161 if (matcher.find()) { 162 content = runRegexp(content, "^===([^=]+)===", "====$1===="); 163 content = runRegexp(content, "^==([^=]+)==", "===$1==="); 164 content = runRegexp(content, "^=([^=]+)=", "==$1=="); 165 } 166 167 // Remove HTML tags 168 169 for (Pattern pattern : _htmlTagPatterns) { 170 matcher = pattern.matcher(content); 171 172 content = matcher.replaceAll(StringPool.BLANK); 173 } 174 175 for (String htmlTag : _HTML_TAGS) { 176 content = StringUtil.replace(content, htmlTag, StringPool.BLANK); 177 } 178 179 // Images 180 181 matcher = _imagePattern.matcher(content); 182 183 StringBuffer sb = new StringBuffer(content); 184 185 int level = 0; 186 int offset = 0; 187 int originalLength = 0; 188 int prefixLength = 0; 189 190 while (matcher.find()) { 191 level = 0; 192 prefixLength = matcher.end(2) - matcher.start(2); 193 194 for (int i = matcher.start(0) + offset; i < sb.length() - 1; i++) { 195 if ((sb.charAt(i) == '[') && (sb.charAt(i + 1) == '[')) { 196 level++; 197 } 198 else if ((sb.charAt(i) == ']') && (sb.charAt(i + 1) == ']')) { 199 level--; 200 201 if (level == 0) { 202 originalLength = (i + 2) - (matcher.start(0) + offset); 203 204 break; 205 } 206 } 207 } 208 209 int imageStartPos = matcher.end(3) + offset; 210 int imageEndPos = matcher.start(2) + offset + originalLength - 4; 211 212 String image = 213 "{{" + MediaWikiImporter.SHARED_IMAGES_TITLE + "/" + 214 StringUtil.toLowerCase( 215 sb.substring(imageStartPos, imageEndPos)) + 216 "}}"; 217 218 int imageLength = image.length(); 219 220 image = StringUtil.replace(image, "[[", StringPool.BLANK); 221 image = StringUtil.replace(image, "]]", StringPool.BLANK); 222 223 sb.replace( 224 matcher.start(0) + offset, 225 matcher.start(0) + originalLength + offset, image); 226 227 offset += 228 MediaWikiImporter.SHARED_IMAGES_TITLE.length() - prefixLength - 229 (imageLength - image.length()); 230 } 231 232 content = sb.toString(); 233 234 // Tables 235 236 matcher = _tablePattern.matcher(content); 237 238 sb = new StringBuffer(content); 239 240 String mediaWikiTable = null; 241 242 offset = 0; 243 originalLength = 0; 244 245 while (matcher.find()) { 246 mediaWikiTable = sb.substring( 247 matcher.start(1) + offset, matcher.end(1) + offset); 248 249 originalLength = mediaWikiTable.length() + 4; 250 251 Matcher matcher1 = _mediaWikiTablePattern1.matcher(mediaWikiTable); 252 253 mediaWikiTable = matcher1.replaceAll(StringPool.BLANK); 254 255 Matcher matcher2 = _mediaWikiTablePattern2.matcher(mediaWikiTable); 256 257 mediaWikiTable = matcher2.replaceAll("$1"); 258 259 Matcher matcher3 = _mediaWikiTablePattern3.matcher(mediaWikiTable); 260 261 mediaWikiTable = matcher3.replaceAll("===$1==="); 262 263 Matcher matcher4 = _mediaWikiTablePattern4.matcher(mediaWikiTable); 264 265 mediaWikiTable = matcher4.replaceAll("|=$1|"); 266 267 mediaWikiTable = StringUtil.replace( 268 mediaWikiTable, CharPool.NEW_LINE, StringPool.BLANK); 269 mediaWikiTable = StringUtil.replace( 270 mediaWikiTable, CharPool.RETURN, StringPool.BLANK); 271 mediaWikiTable = StringUtil.replace(mediaWikiTable, "|-", "\n\r"); 272 mediaWikiTable = StringUtil.replace(mediaWikiTable, "||", "|"); 273 mediaWikiTable = StringUtil.replace( 274 mediaWikiTable, "////", StringPool.BLANK); 275 276 sb.replace( 277 matcher.start(0) + offset, 278 matcher.start(0) + originalLength + offset, mediaWikiTable); 279 280 offset += mediaWikiTable.length() - originalLength; 281 } 282 283 content = sb.toString(); 284 285 content = runRegexp(content, "/{2}(\\{{3})", "$1"); 286 content = runRegexp(content, "(\\}{3})/{2}", "$1"); 287 288 // Remove underscores from links 289 290 matcher = _linkPattern.matcher(content); 291 292 sb = new StringBuffer(content); 293 294 while (matcher.find()) { 295 String link = matcher.group(1).replace( 296 StringPool.UNDERLINE, StringPool.SPACE); 297 298 sb.replace(matcher.start(1), matcher.end(1), link); 299 } 300 301 return TABLE_OF_CONTENTS + super.postProcess(sb.toString()); 302 } 303 304 private static final String[] _HTML_TAGS = { 305 "<blockquote>", "</blockquote>", "<br>", "<br/>", "<br />", "<center>", 306 "</center>", "<cite>", "</cite>","<code>", "</code>", "</div>", 307 "</font>", "<hr>", "<hr/>", "<hr />", "<p>", "</p>", "<tt>", "</tt>", 308 "<var>", "</var>" 309 }; 310 311 private Pattern[] _htmlTagPatterns = { 312 Pattern.compile("<div[^>]*>"), Pattern.compile("<font[^>]*>")}; 313 private Pattern _imagePattern = Pattern.compile( 314 "(\\[{2})(Image|File)(:)", Pattern.DOTALL); 315 private Pattern _linkPattern = Pattern.compile( 316 "\\[{2}([^\\]]*)\\]{2}", Pattern.DOTALL); 317 private Pattern _mediaWikiTablePattern1 = Pattern.compile( 318 "class=(.*?)[|\n\r]"); 319 private Pattern _mediaWikiTablePattern2 = Pattern.compile("(\\|\\-)(.*)"); 320 private Pattern _mediaWikiTablePattern3 = Pattern.compile("\\|\\+(.*)"); 321 private Pattern _mediaWikiTablePattern4 = Pattern.compile("(?m)^!(.+)"); 322 private boolean _strictImportMode; 323 private Pattern _tablePattern = Pattern.compile( 324 "\\{\\|(.*?)\\|\\}", Pattern.DOTALL); 325 private Pattern _titlePattern = Pattern.compile( 326 "^=([^=]+)=", Pattern.MULTILINE); 327 328 }