Mercurial > public > html2wiki
view src/org/nwoca/ssdt/tools/html2wiki/Html2Wiki.java @ 5:d34f4d408ef9
[no commit message]
author | ferrall@nwoca.org |
---|---|
date | Thu, 27 Jan 2011 14:32:34 -0500 |
parents | 22ed6d93442c |
children | 99f293bd507f |
line wrap: on
line source
package org.nwoca.ssdt.tools.html2wiki;

/*
 * Html2Wiki.java
 *
 * Created on May 9, 2006, 3:22 PM
 *
 */

import java.io.*;
import java.util.Collection;
import java.util.ArrayList;
import java.util.List;
import java.util.Iterator;
import org.apache.commons.io.FileSystemUtils;
import org.apache.commons.io.FileUtils;
import java.util.regex.*;
import org.apache.commons.io.FilenameUtils;

/**
 * Converter to convert HTML documents into MediaWiki text.
 *
 * Heavily customized to handle HTML produced by DEC DOCUMENT
 * SOFTWARE doctype.  Breaks file into Chapters in the manner done
 * by Document.  Needs modification to work with other HTML files.
 *
 * <p>An instance wraps one HTML document in a mutable buffer and rewrites
 * that buffer in place, once, by applying a fixed list of
 * {@code Transformer}s in order.
 *
 * @author SMITH
 */
public class Html2Wiki {

    /** Marker that begins each chapter in the converted buffer. */
    private static final Pattern CHAPTER_START = Pattern.compile("<chapter>");

    /** A chapter marker together with its name and closing tag (single line only). */
    private static final Pattern CHAPTER_NAME = Pattern.compile("<chapter>(.*?)</chapter>");

    /** Characters replaced by "-" in chapter names and generated file names. */
    private static final Pattern UNSAFE_NAME_CHARS = Pattern.compile("\\\\|/|:|\\(|\\)");

    /** Document text; rewritten in place by {@link #convert()}. */
    private final StringBuffer buffer;

    /** Transformations applied to {@link #buffer}, in list order. */
    private final Collection<Transformer> transformers;

    /** Set once the transformers have run, so they are never applied twice. */
    private boolean converted = false;

    /** Optional category (second command line argument) handed to the ChapterTransformer. */
    private static String category;

    /**
     * Creates a new instance of Html2Wiki.
     *
     * <p>Only registers the transformers; nothing is converted until
     * {@link #getWikiText()} or {@link #getWikiChapters()} is called.
     * The static {@code category} is read here, so it must be set before
     * construction to have any effect.
     *
     * @param html the HTML document text to convert
     */
    public Html2Wiki(String html) {
        buffer = new StringBuffer(html);
        transformers = new ArrayList<Transformer>();

        // Transformers run in exactly this order (see convert()).
        // transformers.add(new PreTagTransformer());
        // transformers.add(new DeleteTransformer("^\\s",true));
        transformers.add(new DeleteTransformer("<html>|</html>|<body>|</body>"));
        transformers.add(new DeleteTransformer("<!--.*-->(\\n|\\r)*",true));
        transformers.add(new DeleteTransformer("<a .*?>|</a>"));
        transformers.add(new DeleteTransformer("(?m)^\\*"));
        // transformers.add(new DeleteTransformer("<blockquote>|</blockquote>"));
        transformers.add(new DeleteTransformer("<p>"));
        transformers.add(new DeleteTransformer("(?m)<br>$"));
        transformers.add(new DeleteTransformer("<font .*?>|</font>"));
        transformers.add(new CloseTagTransformer("<li>","(\n|\r)*(<li>|</ul>|</ol>|<ul>|<ol>)","</li>"));
        transformers.add(new BadTableDataTransformer());
        transformers.add(new BadTableRowTransformer());
        // transformers.add(new ReplaceTransformer("</td>","\n</td>"));
        transformers.add(new ReplaceTransformer("\\{","\\{"));
        transformers.add(new ReplaceTransformer("\\}","\\}"));
        // transformers.add(new ReplaceTransformer("\\[","\\["));
        // transformers.add(new ReplaceTransformer("\\]","\\]"));
        transformers.add(new ReplaceTransformer("<br>","\\\\"));
        transformers.add(new ReplaceTransformer("<table.*?>|</table>","{table}"));
        transformers.add(new ReplaceTransformer("<tr>|</tr>","{tr}"));
        transformers.add(new ReplaceTransformer("<td.*?>|</td>","{td}"));
        transformers.add(new ReplaceTransformer("<th.*?>|</th>","{th}"));
        transformers.add(new ReplaceTransformer("<ol.*?>|</ol>","{ol}"));
        transformers.add(new ReplaceTransformer("<ul.*?>|</ul>","{ul}"));
        transformers.add(new ReplaceTransformer("<li>","{li}"));
        transformers.add(new ReplaceTransformer("</li>","{li}\n"));
        transformers.add(new ChapterTransformer(category));
        transformers.add(new TagTransformer("<pre>(.*?)</pre>", true, "{code}","{code}"));
        transformers.add(new TagTransformer("<center>(.*?)</center>", true, "{center}","{center}"));
        transformers.add(new TagTransformer("<em>(.*?)</em>", "*","*"));
        transformers.add(new TagTransformer("<strong>(.*?)</strong>", "*","*"));
        transformers.add(new TagTransformer("(?s)<kbd>(.*?)</kbd>", "{{", "}}"));
        transformers.add(new TagTransformer("<h1>(.*)</h1>", "h1. ", ""));
        transformers.add(new TagTransformer("<h2>(.*)</h2>", "h2. ", ""));
        transformers.add(new TagTransformer("<h3>(accessing the program|sample run|sample screens?|sample reports?)</[h|H]3>","h3.", ""));
        transformers.add(new TagTransformer("<h3>(.*)</H3>", "h3. ", ""));
        transformers.add(new TagTransformer("<h3>(.*)</h3>", "h3. ", ""));
        transformers.add(new TagTransformer("<h4>(.*)</h4>", "h4. ", ""));
        transformers.add(new TagTransformer("<h5>(.*)</h5>", "h5. ", ""));
        transformers.add(new TagTransformer("<h6>(.*)</h6>", "h6. ", ""));
        transformers.add(new ReplaceTransformer("\\{center}\\n\\{table}\\n\\{tr\\}\\n\\s{2}\\{td\\}\\{center\\}\\*Note\\*\\{center\\}","{note}"));
        transformers.add(new ReplaceTransformer("\\{td\\}\\n\\s{2}\\{tr\\}\\n\\{table\\}\\n\\{center\\}","{note}"));
        // transformers.add(new TagTransformer("\\{center}\\n\\{table}\\n\\{tr\\}\\n\\s{2}\\{td\\}\\{center\\}\\*Note\\*\\{center\\}(.*?)\\s\\{td\\}\\n\\s{2}\\{tr\\}\\{table\\}", "{note}", "{note}"));
        // transformers.add(new TagTransformer("(\\S)\\s\\n", "", " "));
        transformers.add(new TagTransformer("<blockquote>(.*)</blockquote>", "{quote}", "{quote}"));
        transformers.add(new DeleteTransformer("(?s)<hr.*?>"));
    }

    /**
     * Converts every {@code *.html} file in the given directory.
     *
     * @param args the command line arguments: input directory, then an
     *             optional category
     * @throws IOException if a file cannot be read or written
     */
    public static void main(String[] args) throws IOException {
        if (args.length == 0) {
            // NOTE: the usage text mentions a default directory, but none is
            // applied; the directory argument is required.
            System.out.println("Usage:");
            System.out.println(" Html2Wiki {inputDirectory} [Category]");
            System.out.println(" default is current directory");
            System.out.println(" Processes all *.html files. ");
            System.out.println(" Each 'chapter' written to *.wiki");
            return;
        }

        File inputs = new File(args[0]);
        if (args.length > 1) {
            category = args[1];
        }

        File[] inputFiles = inputs.listFiles(new HtmlFileFilter());
        if (inputFiles == null) {
            // listFiles returns null (not an empty array) when the path is
            // not a directory or cannot be read.
            System.err.println("Not a readable directory: " + inputs.getAbsolutePath());
            return;
        }

        for (File inputFile : inputFiles) {
            process(inputFile);
        }
    }

    /**
     * Converts one HTML file, writing each chapter to
     * {@code <chapter name>.wiki} next to the input and a list of chapter
     * links to {@code <input name>.wikiIndex}.
     *
     * <p>A chapter without a name is written and indexed as
     * {@code Untitled-N}, where N is its 1-based position.
     *
     * @param input the HTML file to convert
     * @throws IOException if the file cannot be read or an output cannot be written
     */
    protected static void process(File input) throws IOException {
        System.out.println(input.getAbsoluteFile());

        // (String) null selects the platform default encoding; the cast keeps
        // the call unambiguous where FileUtils also has Charset overloads.
        Html2Wiki converter = new Html2Wiki(FileUtils.readFileToString(input, (String) null));
        WikiChapter[] chapters = converter.getWikiChapters();
        System.out.format("Writing %d wiki files...\n", chapters.length);

        StringBuilder wikiIndex = new StringBuilder();
        wikiIndex.append("Contents:\n\n");
        for (int i = 0; i < chapters.length; i++) {
            String name = chapters[i].getChapterName();
            if (name == null) {
                name = "Untitled-" + (i + 1);
            }
            wikiIndex.append("# [[" + name + "]]\n");
            File chapterFile = new File(input.getParent(), generateFilename(name) + ".wiki");
            FileUtils.writeStringToFile(chapterFile, chapters[i].getContents().toString(), (String) null);
        }

        System.out.println("Writing wikiIndex...");
        File indexFile = new File(FilenameUtils.removeExtension(input.getPath()) + ".wikiIndex");
        FileUtils.writeStringToFile(indexFile, wikiIndex.toString(), (String) null);
    }

    /**
     * Derives a file name from a chapter name: backslash, slash, colon and
     * parentheses become "-", and any literal {@code <br>} is removed.
     *
     * @param input the chapter name; must not be null
     * @return the name with the characters above replaced
     */
    public static String generateFilename(String input) {
        return UNSAFE_NAME_CHARS.matcher(input).replaceAll("-").replace("<br>", "");
    }

    /**
     * Returns the whole converted document, converting it first if needed.
     *
     * @return the converted text
     */
    public String getWikiText() {
        convert();
        return buffer.toString();
    }

    /**
     * Splits the converted document at each {@code <chapter>} marker.
     *
     * <p>A chapter runs from its marker to the next marker, or to the end of
     * the buffer for the last one. When the marker is immediately followed by
     * {@code name</chapter>} on the same line, that name becomes the chapter
     * name and the contents start after the closing tag. Otherwise the name is
     * {@code null} and the contents start at the marker itself. Text before
     * the first marker is not part of any chapter.
     *
     * @return the chapters in document order; empty if there are no markers
     */
    public WikiChapter[] getWikiChapters() {
        convert();

        List<WikiChapter> chapters = new ArrayList<WikiChapter>();
        Matcher begin = CHAPTER_START.matcher(buffer);
        Matcher end = CHAPTER_START.matcher(buffer);
        Matcher nameMatcher = CHAPTER_NAME.matcher(buffer);

        while (begin.find()) {
            // Use find()'s own result rather than hitEnd() to learn whether
            // another chapter follows.
            boolean hasNext = end.find(begin.end());
            int contentsEnd = hasNext ? end.start() : buffer.length();

            // Anchor the name lookup at this marker. An unanchored find()
            // could pick up a later chapter's name when this marker has no
            // closing tag on its line, putting the contents start past the end.
            nameMatcher.region(begin.start(), buffer.length());
            String chapterName = null;
            int contentsStart = begin.start();
            if (nameMatcher.lookingAt()) {
                chapterName = nameMatcher.group(1);
                contentsStart = nameMatcher.end();
            }

            chapters.add(new WikiChapter(chapterName, buffer.subSequence(contentsStart, contentsEnd)));
        }
        return chapters.toArray(new WikiChapter[0]);
    }

    /**
     * Applies every transformer to the buffer, in order, the first time it is
     * called; later calls do nothing. Progress is printed to standard output.
     */
    private void convert() {
        if (converted) {
            return;
        }
        for (Transformer t : transformers) {
            System.out.println(".Applying: " + t);
            t.apply(buffer);
        }
        converted = true;
    }

    /** Accepts regular files whose name ends in ".html", ignoring case. */
    private static class HtmlFileFilter implements FileFilter {

        private static final String SUFFIX = ".html";

        public boolean accept(File pathname) {
            String name = pathname.getName();
            // regionMatches(ignoreCase=true, ...) avoids locale-dependent
            // lower-casing and returns false when the name is too short.
            return pathname.isFile()
                    && name.regionMatches(true, name.length() - SUFFIX.length(), SUFFIX, 0, SUFFIX.length());
        }
    }

    /** A chapter name paired with that chapter's converted text. */
    protected static class WikiChapter {

        private final String chapterName;
        private final CharSequence contents;

        /**
         * Creates a chapter. In a non-null name, backslash, slash, colon and
         * parentheses become "-", whitespace runs collapse to one space, and
         * "&amp;" becomes "and". A null name is kept as null.
         *
         * @param chapterName the raw chapter name, or null if none was found
         * @param contents    the chapter text
         */
        public WikiChapter(String chapterName, CharSequence contents) {
            if (chapterName == null) {
                this.chapterName = null;
            } else {
                this.chapterName = UNSAFE_NAME_CHARS.matcher(chapterName).replaceAll("-")
                        .replaceAll("\\s+", " ")
                        .replaceAll("&", "and");
            }
            this.contents = contents;
        }

        /** @return the normalized chapter name, or null if the chapter has none */
        public String getChapterName() {
            return chapterName;
        }

        /** @return the chapter text */
        public CharSequence getContents() {
            return contents;
        }

        public String toString() {
            return "Chapter: " + chapterName + " Content length: " + contents.length();
        }
    }
}