Mercurial > public > html2wiki
diff src/org/nwoca/ssdt/tools/html2wiki/Html2Wiki.java @ 0:f8b1ea49d065
Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
author | smith@nwoca.org |
---|---|
date | Fri, 12 May 2006 16:45:42 -0400 |
parents | |
children | 5da2e67620f9 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/org/nwoca/ssdt/tools/html2wiki/Html2Wiki.java Fri May 12 16:45:42 2006 -0400 @@ -0,0 +1,207 @@ +package org.nwoca.ssdt.tools.html2wiki; +/* + * Html2Wiki.java + * + * Created on May 9, 2006, 3:22 PM + * + */ + +import java.io.*; +import java.util.Collection; +import java.util.ArrayList; +import java.util.List; +import java.util.Iterator; +import org.apache.commons.io.FileSystemUtils; +import org.apache.commons.io.FileUtils; +import java.util.regex.*; +import org.apache.commons.io.FilenameUtils; + +/** + * Converter to convert HTML documents into MediaWiki test. + * + * Heavily customized to handle HTML produced by DEC DOCUMENT + * SOFTARE doctype. Breaks file into Chapters in the manner done + * by Document. Needs modification to work with other HTML files. + * + * @author SMITH + */ +public class Html2Wiki { + + private StringBuffer buffer; + private Collection<Transformer> transformers; + private boolean converted = false; + private static String category; + + /** Creates a new instance of Html2Wiki. */ + public Html2Wiki(String html) { + buffer = new StringBuffer(html); + transformers = new ArrayList<Transformer>(); + transformers.add(new PreTagTransformer()); + transformers.add(new DeleteTransformer("^\\s",true)); + transformers.add(new DeleteTransformer("<html>|</html>|<body>|</body>")); + transformers.add(new DeleteTransformer("<!--.*-->(\\n|\\r)*",true)); + transformers.add(new DeleteTransformer("<a .*?>|</a>")); + transformers.add(new DeleteTransformer("(?m)^\\*")); + transformers.add(new DeleteTransformer("<blockquote>|</blockquote>")); + transformers.add(new DeleteTransformer("<p>")); + transformers.add(new DeleteTransformer("(?m)<br>$")); + transformers.add(new DeleteTransformer("<font .*?>|</font>")); + transformers.add(new CloseTagTransformer("<li>","(\n|\r)*(<li>|</ul>|</ol>|<ul>|<ol>)","\n</li>")); + transformers.add(new BadTableDataTransformer()); + transformers.add(new BadTableRowTransformer()); + transformers.add(new ReplaceTransformer("</td>","\n</td>")); + transformers.add(new ChapterTransformer(category)); + transformers.add(new TagTransformer("<em>(.*?)</em>", "''")); + transformers.add(new TagTransformer("<strong>(.*?)</strong>", "'''")); + transformers.add(new TagTransformer("(?s)<kbd>(.*?)</kbd>", "<tt>", "</tt>")); + transformers.add(new TagTransformer("<h1>(.*)</h1>", "== ", " ==")); + transformers.add(new TagTransformer("<h2>(.*)</h2>", "=== ", " ===")); + transformers.add(new TagTransformer("<h3>(accessing the program|sample run|sample screens?|sample reports?)</[h|H]3>","=== ", " ===")); + transformers.add(new TagTransformer("<h3>(.*)</H3>", "", "")); + transformers.add(new TagTransformer("<h3>(.*)</h3>", "==== ", " ====")); + transformers.add(new TagTransformer("<h4>(.*)</h4>", "===== ", " =====")); + transformers.add(new TagTransformer("<h5>(.*)</h5>", "====== ", " ======")); + transformers.add(new TagTransformer("<h6>(.*)</h6>", "======= ", " =======")); + transformers.add(new DeleteTransformer("(?s)<hr.*?>")); + + } + + /** + * @param args the command line arguments + */ + public static void main(String[] args) throws IOException { + + if (args.length == 0) { + System.out.println("Usage:"); + System.out.println(" Html2Wiki {inputDirectory} [Category]"); + System.out.println(" default is current directory"); + System.out.println(" Processes all *.html files. "); + System.out.println(" Each 'chapter' written to *.wiki"); + return; + } + + File inputs = new File(args[0]); + + if (args.length > 1) { + category = args[1]; + } + + File[] inputFiles = inputs.listFiles(new HtmlFileFilter()); + for (int i = 0; i < inputFiles.length; i++) { + + process(inputFiles[i]); + + } + + } + + protected static void process(File input) throws IOException { + + System.out.println(input.getAbsoluteFile()); + + Html2Wiki converter = new Html2Wiki(FileUtils.readFileToString(input,null)); + + + WikiChapter[] chapters = converter.getWikiChapters(); + + System.out.format("Writing %d wiki files...\n",chapters.length); + + StringBuffer wikiIndex = new StringBuffer(); + wikiIndex.append("Contents:\n\n"); + + for (int i = 0; i < chapters.length; i++) { + + wikiIndex.append("# [[" + chapters[i].getChapterName() + "]]\n"); + FileUtils.writeStringToFile(new File(input.getParent(), + generateFilename(chapters[i].getChapterName())+".wiki"), + chapters[i].getContents().toString(), + null); + + } + System.out.println("Writing wikiIndex..."); + + FileUtils.writeStringToFile(new File(FilenameUtils.removeExtension(input.getPath())+".wikiIndex"),wikiIndex.toString(),null); + } + + public static String generateFilename(String input) { + return input.replaceAll("\\\\|/|:|\\(|\\)","-"); + + } + public String getWikiText() { + convert(); + return buffer.toString(); + } + + public WikiChapter[] getWikiChapters() { + + convert(); + + List<WikiChapter> chapters = new ArrayList<WikiChapter>(); + + Pattern chapterPat = Pattern.compile("<chapter>"); + Matcher begin = chapterPat.matcher(buffer); + Matcher end = chapterPat.matcher(buffer); + + while(begin.find()) { + + + end.find(begin.end()); + + Pattern chapterNamePat = Pattern.compile("<chapter>(.*?)</chapter>"); + + Matcher chapterNameMatcher = chapterNamePat.matcher(buffer); + + String chapterName = chapterNameMatcher.find(begin.start()) ? chapterNameMatcher.group(1) : null; + + CharSequence contents = buffer.subSequence(chapterName == null ? begin.start() : chapterNameMatcher.end() + ,end.hitEnd() ? buffer.length() : end.start()); + + chapters.add(new WikiChapter(chapterName,contents)); + + } + return (WikiChapter[])chapters.toArray(new WikiChapter[]{}); + } + + private void convert() { + + if(!converted) { + for (Transformer t : transformers) { + + System.out.println(".Applying: " + t); + t.apply(buffer); + + } + } + converted = true; + } + + private static class HtmlFileFilter implements FileFilter { + public boolean accept(File pathname) { + return pathname.getName().toLowerCase().matches("^.*\\.html$"); + } + + } + private static class WikiChapter { + private String chapterName; + private CharSequence contents; + + public WikiChapter(String chapterName, CharSequence contents) { + this.chapterName = chapterName.replaceAll("\\\\|/|:|\\(|\\)","-").replaceAll("\\s+"," ").replaceAll("&","and"); + + this.contents = contents; + } + + public String getChapterName() { + return chapterName; + } + + public CharSequence getContents() { + return contents; + } + + public String toString() { + return "Chapter: " + chapterName + "\nContents: " + contents; + } + } + +}