/**
 * Copyright (c) 2000-2012 Liferay, Inc. All rights reserved.
 *
 * This library is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2.1 of the License, or (at your option)
 * any later version.
 *
 * This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
 * details.
 */

package com.liferay.portlet.wiki.translators;

import com.liferay.portal.kernel.util.StringPool;
import com.liferay.portlet.wiki.importers.mediawiki.MediaWikiImporter;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Translates MediaWiki markup into Creole markup.
 *
 * <p>
 * The constructor registers two sets of regular expressions in collections
 * inherited from {@link BaseTranslator}: substitution rules in
 * <code>regexps</code> and protected-span patterns in
 * <code>nowikiRegexps</code>. How and when those rules are applied is decided
 * by the base class and is not visible here. {@link #postProcess(String)}
 * then performs the rewrites that need more than a plain substitution.
 * </p>
 *
 * @author Jorge Ferrer
 */
public class MediaWikiToCreoleTranslator extends BaseTranslator {

    /**
     * Macro (followed by a blank line) that {@link #postProcess(String)}
     * prepends to every translated page.
     */
    public static final String TABLE_OF_CONTENTS = "<<TableOfContents>>\n\n";

    /**
     * Creates the translator and registers its rules by calling
     * {@link #initRegexps()} followed by {@link #initNowikiRegexps()}.
     */
    public MediaWikiToCreoleTranslator() {
        initRegexps();
        initNowikiRegexps();
    }

    /**
     * Registers the patterns whose matches are treated as "nowiki" spans:
     * <code>nowiki</code> and <code>pre</code> elements, plus Creole escape
     * sequences introduced by a tilde.
     */
    protected void initNowikiRegexps() {

        // Preformat protected

        nowikiRegexps.add("(<nowiki>)(.*?)(</nowiki>)");
        nowikiRegexps.add("(<pre>)(.*?)(</pre>)");

        // Escape protected

        nowikiRegexps.add(
            "~(\\*\\*|~|//|-|#|\\{\\{|}}|\\\\|~\\[~~[|]]|----|=|\\|)");
    }

    /**
     * Registers the MediaWiki to Creole substitution rules (pattern mapped to
     * replacement).
     *
     * <p>
     * NOTE(review): several rules overlap (for example the emphasis rules and
     * the URL rules), so the result presumably depends on the rules being
     * applied in registration order; confirm against {@link BaseTranslator}.
     * </p>
     */
    protected void initRegexps() {

        // Clean unnecessary header emphasis. Each pattern carries the same
        // number of equal signs on both sides so that a rule can never
        // produce an unbalanced heading when it is applied on its own.

        regexps.put("= '''([^=]+)''' =", "= $1 =");
        regexps.put("== '''([^=]+)''' ==", "== $1 ==");
        regexps.put("=== '''([^=]+)''' ===", "=== $1 ===");

        // Unescape angle brackets: decode the HTML entities back into the
        // literal characters so that the tag based rules below can see them.

        regexps.put("&lt;", "<");
        regexps.put("&gt;", ">");

        // Remove categories

        regexps.put("\\[\\[[Cc]ategory:([^\\]]*)\\]\\][\\n]*", "");

        // Remove disambiguations

        regexps.put("\\{{2}OtherTopics\\|([^\\}]*)\\}{2}", StringPool.BLANK);

        // Remove work in progress

        regexps.put("\\{{2}Work in progress\\}{2}", StringPool.BLANK);

        // Bold and italics. An unterminated run is closed at the next blank
        // line, which is captured in group 3 and put back after the closing
        // marker.
        // NOTE(review): MediaWiki normally writes bold italics with five
        // apostrophes; this pattern uses four. Confirm that is intended.

        regexps.put(
            "''''((?s:.)*?)(''''|(\n\n|\r\r|\r\n\r\n))", "**//$1//**$3");

        // Bold

        regexps.put("'''((?s:.)*?)('''|(\n\n|\r\r|\r\n\r\n))", "**$1**$3");

        // Italics

        regexps.put("''((?s:.)*?)(''|(\n\n|\r\r|\r\n\r\n))", "//$1//$3");

        // Normalize URLs: a double bracketed external link becomes a single
        // bracketed one so that the two URL rules below can handle it.

        regexps.put("\\[{2}((http|ftp)[^ ]*) ([^\\]]*)\\]{2}", "[$1 $3]");

        // URL

        regexps.put("\\[((http|ftp)[^ ]*)\\]", "[[$1]]");

        // URL with label

        regexps.put("\\[((http|ftp)[^ ]*) ([^\\]]*)\\]", "[[$1|$3]]");

        // Term and definition

        regexps.put("^\\t([\\w]+):\\t(.*)", "**$1**:\n$2");

        // Indented paragraph

        regexps.put("^\\t:\\t(.*)", "$1");

        // Monospace: a run of lines that start with a space is wrapped, as a
        // whole ($0), in a preformatted block.

        regexps.put("(^ (.+))(\\n (.+))*", "{{{\n$0\n}}}");

        // No wiki

        regexps.put("<nowiki>([^<]*)</nowiki>", "{{{$1}}}");

        // HTML PRE

        regexps.put("<pre>([^<]*)</pre>", "{{{$1}}}");

        // User reference

        regexps.put("[-]*\\[{2}User:([^\\]]*)\\]{2}", "$1");
    }

    /**
     * Applies the rewrites that cannot be expressed as plain substitution
     * rules and then delegates to the superclass.
     *
     * <p>
     * In order: heading levels are shifted down by one when the content has a
     * level one heading, the tags listed in <code>_HTML_TAGS</code> are
     * removed, image links are pointed at the shared images page, and
     * underscores inside double bracketed links are replaced.
     * </p>
     *
     * @param  content the markup to post process
     * @return {@link #TABLE_OF_CONTENTS} followed by the result of the
     *         superclass post processing of the rewritten content
     */
    @Override
    protected String postProcess(String content) {

        // LEP-6118: when a level one heading is present, push every heading
        // one level down. The deepest level is rewritten first so that a
        // heading that has just been pushed down is not matched again by the
        // next, shallower rule.

        if (_HEADING_PATTERN.matcher(content).find()) {
            content = runRegexp(content, "^===([^=]+)===", "====$1====");
            content = runRegexp(content, "^==([^=]+)==", "===$1===");
            content = runRegexp(content, "^=([^=]+)=", "==$1==");
        }

        // Remove HTML tags. The entries are regular expressions.

        for (String htmlTag : _HTML_TAGS) {
            content = content.replaceAll(htmlTag, StringPool.BLANK);
        }

        // Images. The replacement is quoted because it is literal text, and
        // appendReplacement keeps track of positions itself, so the result
        // stays correct even if lower casing changes the length of the name.
        // NOTE(review): toLowerCase() uses the default locale; it is left as
        // is because the code that stores the image names is not visible
        // here and both sides must agree.

        Matcher imageMatcher = _IMAGE_PATTERN.matcher(content);

        StringBuffer imagesSB = new StringBuffer(content.length());

        while (imageMatcher.find()) {
            String image =
                "{{" + MediaWikiImporter.SHARED_IMAGES_TITLE + "/" +
                    imageMatcher.group(1).toLowerCase() + "}}";

            imageMatcher.appendReplacement(
                imagesSB, Matcher.quoteReplacement(image));
        }

        imageMatcher.appendTail(imagesSB);

        content = imagesSB.toString();

        // Remove underscores from links. The whole link is rebuilt instead
        // of being patched in place, so the rewrite does not depend on the
        // replacement having the same length as the text it replaces.

        Matcher linkMatcher = _LINK_PATTERN.matcher(content);

        StringBuffer linksSB = new StringBuffer(content.length());

        while (linkMatcher.find()) {
            String link = linkMatcher.group(1).replace(
                StringPool.UNDERLINE, StringPool.SPACE);

            linkMatcher.appendReplacement(
                linksSB, Matcher.quoteReplacement("[[" + link + "]]"));
        }

        linkMatcher.appendTail(linksSB);

        return TABLE_OF_CONTENTS + super.postProcess(linksSB.toString());
    }

    /**
     * Matches a level one heading at the start of a line.
     */
    private static final Pattern _HEADING_PATTERN = Pattern.compile(
        "^=([^=]+)=", Pattern.MULTILINE);

    /**
     * Regular expressions for the HTML tags that are stripped from the
     * content.
     */
    private static final String[] _HTML_TAGS = {
        "<blockquote>", "</blockquote>", "<br>", "<br/>", "<br />", "<center>",
        "</center>", "<cite>", "</cite>","<code>", "</code>", "<div[^>]*>",
        "</div>", "<font[^>]*>", "</font>", "<hr>", "<hr/>", "<hr />", "<p>",
        "</p>", "<tt>", "</tt>", "<var>", "</var>"};

    /**
     * Matches a double bracketed image link; group 1 is the image name.
     */
    private static final Pattern _IMAGE_PATTERN = Pattern.compile(
        "\\[{2}Image:([^\\]]*)\\]{2}", Pattern.DOTALL);

    /**
     * Matches any double bracketed link; group 1 is the text between the
     * brackets.
     */
    private static final Pattern _LINK_PATTERN = Pattern.compile(
        "\\[{2}([^\\]]*)\\]{2}", Pattern.DOTALL);

}