Mercurial > public > html2wiki
view src/org/nwoca/ssdt/tools/html2wiki/Html2Wiki.java @ 17:a88e2f8fb117 tip
added a replace transformer to remove ;) smilies
author | ferrall@nwoca.org |
---|---|
date | Tue, 08 Feb 2011 09:27:40 -0500 |
parents | 001e43423d5d |
children |
line wrap: on
line source
package org.nwoca.ssdt.tools.html2wiki;

/*
 * Html2Wiki.java
 *
 * Created on May 9, 2006, 3:22 PM
 *
 */

import java.io.*;
import java.util.Collection;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.io.FileUtils;
import java.util.regex.*;

/**
 * Converter to convert HTML documents into wiki text.
 *
 * Heavily customized to handle HTML produced by DEC DOCUMENT
 * SOFTWARE doctype. Breaks file into Chapters in the manner done
 * by Document. Needs modification to work with other HTML files.
 *
 * <p>The conversion is an ordered pipeline of {@code Transformer} objects that
 * are applied, in sequence, to a single shared {@link StringBuffer}. The
 * transformer classes are defined elsewhere in the project; the comments in
 * the constructor describe the apparent intent of each step based on its
 * class name and pattern only.
 *
 * <p>NOTE(review): the emitted markup ({@code {code}}, {@code {info}},
 * {@code h1. }) looks like Confluence-style wiki markup rather than
 * MediaWiki -- confirm the intended target.
 *
 * @author SMITH
 */
public class Html2Wiki {

    /** Marker that opens a chapter in the converted buffer. */
    private static final Pattern CHAPTER_START = Pattern.compile("<chapter>");

    /** A chapter marker together with its name; group 1 is the name. */
    private static final Pattern CHAPTER_NAME = Pattern.compile("<chapter>(.*?)</chapter>");

    /** Characters replaced by '-' in chapter names and file names: backslash, slash, colon, parentheses. */
    private static final Pattern FILENAME_UNSAFE = Pattern.compile("\\\\|/|:|\\(|\\)");

    /** Category handed to the ChapterTransformer; set from the command line in {@link #main}. */
    private static String category;

    private final StringBuffer buffer;
    private final Collection<Transformer> transformers;
    private boolean converted = false;

    /**
     * Creates a new instance of Html2Wiki.
     *
     * <p>Builds the transformer pipeline; nothing is converted until
     * {@link #getWikiText()} or {@link #getWikiChapters()} is called. The
     * order of the transformers is significant: later patterns match markup
     * produced by earlier steps.
     *
     * @param html the HTML document text to convert
     */
    public Html2Wiki(String html) {
        buffer = new StringBuffer(html);
        transformers = new ArrayList<Transformer>();

        // --- Strip markup that has no wiki equivalent ---
        transformers.add(new DeleteTransformer("<html>|</html>|<body>|</body>"));
        transformers.add(new DeleteTransformer("<!--.*-->(\\n|\\r)*", true));
        transformers.add(new DeleteTransformer("<a .*?>|</a>"));
        transformers.add(new DeleteTransformer("(?m)^\\*"));
        transformers.add(new DeleteTransformer("(?m)<br>$"));
        transformers.add(new DeleteTransformer("<caption>.*</caption>")); // remove SDML captions (used for TOC)
        transformers.add(new DeleteTransformer("<font .*?>|</font>"));

        // --- Repair unclosed / malformed list and table markup, then reflow ---
        transformers.add(new CloseTagTransformer("<li>", "(\n|\r)*(<li>|</ul>|</ol>|<ul>|<ol>)", "</li>"));
        transformers.add(new BadTableDataTransformer());
        transformers.add(new BadTableRowTransformer());
        transformers.add(new ReflowTransformer());
        transformers.add(new DeleteTransformer("<p>"));

        // --- Escape characters that are significant in the wiki markup ---
        transformers.add(new ReplaceTransformer("\\{", "\\{")); // Escape braces
        transformers.add(new ReplaceTransformer("\\}", "\\}"));
        transformers.add(new ReplaceTransformer("\\[", "\\[")); // Escape brackets
        transformers.add(new ReplaceTransformer("\\]", "\\]"));
        transformers.add(new PreTagTransformer()); // Unescape brackets inside <pre>
        // Disabled in the original source:
        // transformers.add(new ReplaceTransformer("<br>","\\\\"));

        // --- Tables and lists ---
        // replace table tag preserving border setting.
        transformers.add(new TagTransformer("<table\\sborder=(\\d).*?>", true, "{table:border=", "|width=75%}"));
        transformers.add(new ReplaceTransformer("<table.*?>|</table>", "{table}"));
        transformers.add(new ReplaceTransformer("<tr>|</tr>", "{tr}"));
        transformers.add(new ReplaceTransformer("<td.*?>|</td>", "{td}"));
        transformers.add(new ReplaceTransformer("<th.*?>|</th>", "{th}"));
        transformers.add(new ReplaceTransformer("<ol.*?>|</ol>", "{ol}"));
        transformers.add(new ReplaceTransformer("<ul.*?>|</ul>", "{ul}"));
        transformers.add(new ReplaceTransformer("<li>", "{li}"));
        transformers.add(new ReplaceTransformer("\\n\\s*</li>", "{li}\n")); // remove leading space from </li>
        transformers.add(new ReplaceTransformer("</li>", "{li}\n")); // Replace remaining </li>
        transformers.add(new ReplaceTransformer("\\u001B", " ")); // Replace ASCII ESC character

        // --- Chapter markers (consumed later by getWikiChapters) ---
        transformers.add(new ChapterTransformer(category));

        // --- Inline and block formatting ---
        transformers.add(new TagTransformer("<pre>(.*?)</pre>", true, "{code}", "{code}"));
        transformers.add(new TagTransformer("<center>(.*?)</center>", true, "{center}", "{center}"));
        transformers.add(new TagTransformer("<em>(.*?)</em>", "*", "*"));
        transformers.add(new TagTransformer("<strong>(.*?)</strong>", true, "*", "*"));
        transformers.add(new TagTransformer("<u>(.*?)</u>", "+", "+"));
        transformers.add(new TagTransformer("(?s)<kbd>(.*?)</kbd>", "{{", "}}"));

        // --- Headings ---
        transformers.add(new TagTransformer("<h1>(.*)</h1>", "h1. ", ""));
        transformers.add(new TagTransformer("<h2.*>(.*)</h2>", "h2. ", ""));
        transformers.add(new TagTransformer("<h3>(accessing the program|sample run|sample screens?|sample reports?)</[h|H]3>", "h3.", ""));
        transformers.add(new TagTransformer("<h3>(.*)</H3>", "h3. ", ""));
        transformers.add(new TagTransformer("<h3>(.*)</h3>", "h3. ", ""));
        transformers.add(new TagTransformer("<h4>(.*)</h4>", "h4. ", ""));
        transformers.add(new TagTransformer("<h5>(.*)</h5>", "h5. ", ""));
        transformers.add(new TagTransformer("<h6>(.*)</h6>", "h6. ", ""));

        // Replace Notes with Info tags.
        transformers.add(new ReplaceTransformer("\\{center}\\n\\{table:border=\\d.*}\\n\\{tr\\}\\n\\s{2}\\{td\\}\\{center\\}\\*Note\\*\\{center\\}", "{info}"));
        transformers.add(new ReplaceTransformer("\\{td\\}\\n\\s{2}\\{tr\\}\\n\\{table\\}\\n\\{center\\}", "{info}"));

        // Remove unnecessary table surrounding code blocks.
        transformers.add(new ReplaceTransformer("\\{table:.*\\}(\\n|\\s|\\{t.\\}|\\*\\S*\\*)*\\{code\\}", "{code}"));
        transformers.add(new ReplaceTransformer("\\{code\\}(\\n|\\{t.\\}|\\s)*\\{table\\}", "{code}"));

        // Change borderStyle of code window for "screenshots" to none.
        transformers.add(new TagTransformer("\\{code\\}([\\s\\n]*?_______________)", true, "{code:borderStyle=none}", ""));
        transformers.add(new TagTransformer("<blockquote>(.*?)</blockquote>", true, "{quote}", "{quote}"));
        transformers.add(new DeleteTransformer("(?s)<hr.*?>"));

        // --- Reflow the text inside paired wiki macros ---
        transformers.add(new ReflowTransformer("(\\{info\\})([^\\{]*)(\\{info\\})"));
        transformers.add(new ReflowTransformer("(\\{note\\})([^\\{]*)(\\{note\\})"));
        transformers.add(new ReflowTransformer("(\\{td\\})([^\\{]*)(\\{td\\})"));
        transformers.add(new ReflowTransformer("(\\{li\\})([^\\{]*)(\\{li\\})"));
        transformers.add(new TagTransformer("<sup>(.*?)</sup>", true, "^\\[", "\\]^ "));

        // --- Decode HTML character entities ---
        // "&amp;" is decoded last so that text such as "&amp;lt;" ends up as
        // the literal "&lt;" instead of being decoded twice.
        transformers.add(new ReplaceTransformer("&lt;", "<"));
        transformers.add(new ReplaceTransformer("&gt;", ">"));
        transformers.add(new ReplaceTransformer("&quot;", "\""));
        transformers.add(new ReplaceTransformer("&amp;", "&"));

        // --- Final clean-up of sequences the wiki would misinterpret ---
        transformers.add(new ReplaceTransformer(":\\)", ": )")); // No smilies...
        transformers.add(new ReplaceTransformer(";\\)", "; )")); // No smilies...
        transformers.add(new ReplaceTransformer("(\\w)(--)(\\w)", " -- ", 2)); // avoid strikeout
        transformers.add(new ReplaceTransformer("\\{table(.*?)\\}\\n\\s{2}\\{tr\\}\\n\\s{4}\\{td\\}Contents\\{td\\}\\n\\s{2}\\{tr\\}\\n\\{table\\}", "")); // remove "contents" table
    }

    /**
     * Converts every {@code *.html} file in a directory, writing one
     * {@code *.wiki} file per chapter next to the input file.
     *
     * @param args the command line arguments: the input directory, optionally
     *             followed by a category
     * @throws IOException if an input file cannot be read or an output file
     *                     cannot be written
     */
    public static void main(String[] args) throws IOException {
        if (args.length == 0) {
            System.out.println("Usage:");
            System.out.println(" Html2Wiki {inputDirectory} [Category]");
            System.out.println(" default is current directory");
            System.out.println(" Processes all *.html files. ");
            System.out.println(" Each 'chapter' written to *.wiki");
            return;
        }

        File inputs = new File(args[0]);
        if (args.length > 1) {
            category = args[1];
        }

        File[] inputFiles = inputs.listFiles(new HtmlFileFilter());
        if (inputFiles == null) {
            // listFiles returns null when the path is not a directory or cannot be read.
            System.err.println("Not a readable directory: " + inputs.getAbsolutePath());
            return;
        }
        for (int i = 0; i < inputFiles.length; i++) {
            process(inputFiles[i]);
        }
    }

    /**
     * Converts one HTML file and writes each resulting chapter to
     * {@code <chapter name>.wiki} in the same directory as the input.
     *
     * <p>A chapter without a name is written to a file named after the input
     * file and the chapter's position instead.
     *
     * @param input the HTML file to convert
     * @throws IOException if the file cannot be read or a chapter cannot be written
     */
    protected static void process(File input) throws IOException {
        System.out.println(input.getAbsoluteFile());
        // The null encoding is passed through unchanged; the cast only pins the
        // String overload so the call stays unambiguous if Charset overloads exist.
        Html2Wiki converter = new Html2Wiki(FileUtils.readFileToString(input, (String) null));
        WikiChapter[] chapters = converter.getWikiChapters();
        System.out.format("Writing %d wiki files...\n", chapters.length);
        for (int i = 0; i < chapters.length; i++) {
            String name = chapters[i].getChapterName();
            if (name == null) {
                name = input.getName() + "-chapter" + (i + 1);
            }
            File target = new File(input.getParent(), generateFilename(name) + ".wiki");
            FileUtils.writeStringToFile(target, chapters[i].getContents().toString(), (String) null);
        }
    }

    /**
     * Derives a file name from a chapter name by replacing backslashes,
     * slashes, colons and parentheses with '-' and removing any
     * {@code <br>} text.
     *
     * @param input the chapter name; must not be null
     * @return the sanitized name, without extension
     */
    public static String generateFilename(String input) {
        return FILENAME_UNSAFE.matcher(input).replaceAll("-").replace("<br>", "");
    }

    /**
     * Runs the conversion if it has not run yet and returns the whole
     * converted buffer as one string.
     *
     * @return the converted text
     */
    public String getWikiText() {
        convert();
        return buffer.toString();
    }

    /**
     * Runs the conversion if it has not run yet and splits the converted
     * buffer at each {@code <chapter>} marker.
     *
     * <p>Each chapter runs from its marker up to the next marker, or to the
     * end of the buffer for the last chapter. When the marker is immediately
     * followed by a {@code <chapter>name</chapter>} element the name is
     * extracted and the contents start after it; otherwise the chapter name
     * is {@code null} and the contents start at the marker itself. Text
     * before the first marker is not part of any chapter.
     *
     * @return the chapters in document order; empty if there is no marker
     */
    public WikiChapter[] getWikiChapters() {
        convert();
        List<WikiChapter> chapters = new ArrayList<WikiChapter>();
        Matcher begin = CHAPTER_START.matcher(buffer);
        Matcher end = CHAPTER_START.matcher(buffer);
        Matcher nameMatcher = CHAPTER_NAME.matcher(buffer);

        while (begin.find()) {
            // Use the result of find() directly to decide whether another chapter follows.
            boolean hasNext = end.find(begin.end());
            int contentEnd = hasNext ? end.start() : buffer.length();

            String chapterName = null;
            int contentStart = begin.start();
            // Only accept a name that belongs to THIS marker and lies inside this
            // chapter; a match further on belongs to a later chapter.
            if (nameMatcher.find(begin.start())
                    && nameMatcher.start() == begin.start()
                    && nameMatcher.end() <= contentEnd) {
                chapterName = nameMatcher.group(1);
                contentStart = nameMatcher.end();
            }

            CharSequence contents = buffer.subSequence(contentStart, contentEnd);
            chapters.add(new WikiChapter(chapterName, contents));
        }
        return chapters.toArray(new WikiChapter[0]);
    }

    /**
     * Applies every transformer to the buffer, in registration order.
     * Runs at most once per instance; later calls do nothing.
     */
    private void convert() {
        if (converted) {
            return;
        }
        for (Transformer t : transformers) {
            System.out.println(".Applying: " + t);
            t.apply(buffer);
        }
        converted = true;
    }

    /** Accepts regular files whose name ends in ".html", ignoring case. */
    private static class HtmlFileFilter implements FileFilter {
        public boolean accept(File pathname) {
            // Directories are rejected: process() reads each accepted entry as a file.
            return pathname.isFile()
                    && pathname.getName().toLowerCase().matches("^.*\\.html$");
        }
    }

    /** A named section of the converted document. */
    protected static class WikiChapter {
        private final String chapterName;
        private final CharSequence contents;

        /**
         * @param chapterName the raw chapter name, or null if none was found;
         *                    a non-null name has backslashes, slashes, colons
         *                    and parentheses replaced by '-', whitespace runs
         *                    collapsed to one space and '&amp;' replaced by "and"
         * @param contents    the chapter text
         */
        public WikiChapter(String chapterName, CharSequence contents) {
            if (chapterName == null) {
                this.chapterName = null;
            } else {
                this.chapterName = FILENAME_UNSAFE.matcher(chapterName).replaceAll("-")
                        .replaceAll("\\s+", " ")
                        .replaceAll("&", "and");
            }
            this.contents = contents;
        }

        /** @return the normalized chapter name, or null if the chapter has none */
        public String getChapterName() {
            return chapterName;
        }

        /** @return the chapter text */
        public CharSequence getContents() {
            return contents;
        }

        @Override
        public String toString() {
            return "Chapter: " + chapterName + " Content length: " + contents.length();
        }
    }
}