diff src/org/nwoca/ssdt/tools/html2wiki/Html2Wiki.java @ 0:f8b1ea49d065

Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
author smith@nwoca.org
date Fri, 12 May 2006 16:45:42 -0400
parents
children 5da2e67620f9
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/org/nwoca/ssdt/tools/html2wiki/Html2Wiki.java	Fri May 12 16:45:42 2006 -0400
@@ -0,0 +1,207 @@
+package org.nwoca.ssdt.tools.html2wiki;
+/*
+ * Html2Wiki.java
+ *
+ * Created on May 9, 2006, 3:22 PM
+ *
+ */
+
+import java.io.*;
+import java.util.Collection;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Iterator;
+import org.apache.commons.io.FileSystemUtils;
+import org.apache.commons.io.FileUtils;
+import java.util.regex.*;
+import org.apache.commons.io.FilenameUtils;
+
+/**
+ * Converts HTML documents into MediaWiki text.
+ *
+ * Heavily customized to handle HTML produced by DEC DOCUMENT
+ * SOFTWARE doctype.  Breaks file into Chapters in the manner done
+ * by Document.  Needs modification to work with other HTML files.
+ *
+ * @author SMITH
+ */
+public class Html2Wiki {
+    
+    private StringBuffer buffer; // document text; starts as the raw HTML and is handed to every transformer by convert()
+    private Collection<Transformer> transformers; // conversion steps; an ArrayList, so they run in the order they were added
+    private boolean converted = false; // set after convert() has run all transformers, so they are applied only once
+    private static String category; // optional args[1] from main(); passed to ChapterTransformer in the constructor; null if not given
+    
+    /**
+     * Creates a converter for one HTML document and builds the ordered list
+     * of transformers that {@code convert()} later applies to it.
+     *
+     * The order of the {@code add} calls is the order of application, so it
+     * must be preserved: for example the special-case {@code <h3>} rule has
+     * to run before the generic {@code <h3>} rules.  The transformer classes
+     * are defined outside this file; going by their names and arguments the
+     * first argument of each is a regular expression.
+     *
+     * The static {@code category} is read here, so it must be set before the
+     * instance is constructed.
+     *
+     * @param html the complete HTML text to convert
+     */
+    public Html2Wiki(String html) {
+        buffer = new StringBuffer(html);
+        transformers =  new ArrayList<Transformer>();
+        transformers.add(new PreTagTransformer());
+        transformers.add(new DeleteTransformer("^\\s",true));
+        transformers.add(new DeleteTransformer("<html>|</html>|<body>|</body>"));
+        transformers.add(new DeleteTransformer("<!--.*-->(\\n|\\r)*",true));
+        transformers.add(new DeleteTransformer("<a .*?>|</a>"));
+        transformers.add(new DeleteTransformer("(?m)^\\*"));
+        transformers.add(new DeleteTransformer("<blockquote>|</blockquote>"));
+        transformers.add(new DeleteTransformer("<p>"));
+        transformers.add(new DeleteTransformer("(?m)<br>$"));
+        transformers.add(new DeleteTransformer("<font .*?>|</font>"));
+        transformers.add(new CloseTagTransformer("<li>","(\n|\r)*(<li>|</ul>|</ol>|<ul>|<ol>)","\n</li>"));
+        transformers.add(new BadTableDataTransformer());
+        transformers.add(new BadTableRowTransformer());
+        transformers.add(new ReplaceTransformer("</td>","\n</td>"));
+        transformers.add(new ChapterTransformer(category));
+        transformers.add(new TagTransformer("<em>(.*?)</em>",    "''"));
+        transformers.add(new TagTransformer("<strong>(.*?)</strong>",    "'''"));
+        transformers.add(new TagTransformer("(?s)<kbd>(.*?)</kbd>",  "<tt>", "</tt>"));
+        transformers.add(new TagTransformer("<h1>(.*)</h1>",    "== ",      " =="));
+        transformers.add(new TagTransformer("<h2>(.*)</h2>",    "=== ",     " ==="));
+        // Inside a character class '|' is a literal, not alternation: the
+        // former "[h|H]" also accepted "</|3>".  "[hH]" matches only h or H.
+        transformers.add(new TagTransformer("<h3>(accessing the program|sample run|sample screens?|sample reports?)</[hH]3>","=== ",    " ==="));
+        transformers.add(new TagTransformer("<h3>(.*)</H3>",    "",         ""));
+        transformers.add(new TagTransformer("<h3>(.*)</h3>",    "==== ",        " ===="));
+        transformers.add(new TagTransformer("<h4>(.*)</h4>",    "===== ",   " ====="));
+        transformers.add(new TagTransformer("<h5>(.*)</h5>",    "====== ",  " ======"));
+        transformers.add(new TagTransformer("<h6>(.*)</h6>",    "======= ", " ======="));
+        transformers.add(new DeleteTransformer("(?s)<hr.*?>"));
+
+    }
+    
+    /**
+     * Command-line entry point: converts every *.html file found directly
+     * in the given directory.
+     *
+     * @param args {@code args[0]} is the input directory; the optional
+     *             {@code args[1]} is stored in the static {@code category}
+     * @throws IOException if the input directory cannot be listed, or if
+     *                     reading or writing a file fails
+     */
+    public static void main(String[] args) throws IOException {
+
+        // NOTE(review): the usage text mentions a default of the current
+        // directory, but with no arguments only the usage is printed.
+        if (args.length == 0) {
+            System.out.println("Usage:");
+            System.out.println("  Html2Wiki {inputDirectory} [Category]");
+            System.out.println("      default is current directory");
+            System.out.println("      Processes all *.html files. ");
+            System.out.println("      Each 'chapter' written to *.wiki");
+            return;
+        }
+
+        File inputs = new File(args[0]);
+
+        if (args.length > 1) {
+            category = args[1];
+        }
+
+        // File.listFiles returns null (not an empty array) when the path is
+        // not a directory or cannot be read.  Report that clearly instead of
+        // failing with a NullPointerException in the loop below.
+        File[] inputFiles = inputs.listFiles(new HtmlFileFilter());
+        if (inputFiles == null) {
+            throw new FileNotFoundException("Not a readable directory: " + inputs.getAbsolutePath());
+        }
+
+        for (File inputFile : inputFiles) {
+            process(inputFile);
+        }
+
+    }
+    
+    /**
+     * Converts a single HTML file.  Every chapter is written to
+     * "{chapter name}.wiki" in the input file's directory, and a numbered
+     * list of links to the chapters is written to a ".wikiIndex" file that
+     * shares the input file's path minus its extension.
+     *
+     * @param input the HTML file to convert
+     * @throws IOException if the input cannot be read or an output file
+     *                     cannot be written
+     */
+    protected static void process(File input) throws IOException {
+        System.out.println(input.getAbsoluteFile());
+
+        String html = FileUtils.readFileToString(input,null);
+        WikiChapter[] chapters = new Html2Wiki(html).getWikiChapters();
+
+        System.out.format("Writing %d wiki files...\n",chapters.length);
+
+        StringBuffer wikiIndex = new StringBuffer("Contents:\n\n");
+
+        for (WikiChapter chapter : chapters) {
+            String title = chapter.getChapterName();
+
+            // One index entry per chapter, in document order.
+            wikiIndex.append("# [[").append(title).append("]]\n");
+
+            File target = new File(input.getParent(), generateFilename(title) + ".wiki");
+            FileUtils.writeStringToFile(target, chapter.getContents().toString(), null);
+        }
+
+        System.out.println("Writing wikiIndex...");
+
+        String indexPath = FilenameUtils.removeExtension(input.getPath()) + ".wikiIndex";
+        FileUtils.writeStringToFile(new File(indexPath), wikiIndex.toString(), null);
+    }
+    
+    /**
+     * Turns a chapter name into a file name by replacing each backslash,
+     * slash, colon and parenthesis with a hyphen.
+     *
+     * @param input the name to sanitise
+     * @return the name with every such character replaced by "-"
+     */
+    public static String generateFilename(String input) {
+        final Matcher unsafeChars = Pattern.compile("\\\\|/|:|\\(|\\)").matcher(input);
+        return unsafeChars.replaceAll("-");
+    }
+    /**
+     * Runs the conversion if it has not run yet and returns the whole
+     * converted document as one string.
+     *
+     * @return the current contents of the buffer after conversion
+     */
+    public String getWikiText() {
+        convert();
+        final String wikiText = buffer.toString();
+        return wikiText;
+    }
+    
+    public WikiChapter[] getWikiChapters() {
+        
+        convert();
+        
+        List<WikiChapter> chapters = new ArrayList<WikiChapter>();
+        
+        Pattern chapterPat = Pattern.compile("<chapter>");
+        Matcher begin = chapterPat.matcher(buffer);
+        Matcher end = chapterPat.matcher(buffer);
+        
+        while(begin.find()) {
+            
+            
+            end.find(begin.end());
+            
+            Pattern chapterNamePat = Pattern.compile("<chapter>(.*?)</chapter>");
+            
+            Matcher chapterNameMatcher = chapterNamePat.matcher(buffer);
+            
+            String chapterName = chapterNameMatcher.find(begin.start()) ? chapterNameMatcher.group(1) : null;
+            
+            CharSequence contents = buffer.subSequence(chapterName == null ? begin.start() : chapterNameMatcher.end()
+            ,end.hitEnd() ? buffer.length() : end.start());
+            
+            chapters.add(new WikiChapter(chapterName,contents));
+            
+        }
+        return (WikiChapter[])chapters.toArray(new WikiChapter[]{});
+    }
+    
+    /**
+     * Applies every transformer to the buffer, in list order, the first
+     * time it is called.  Later calls return immediately.  The flag is set
+     * only after all transformers have completed.
+     */
+    private void convert() {
+        if (converted) {
+            return;
+        }
+
+        for (Iterator<Transformer> steps = transformers.iterator(); steps.hasNext();) {
+            Transformer step = steps.next();
+            System.out.println(".Applying: " + step);
+            step.apply(buffer);
+        }
+
+        converted = true;
+    }
+    
+    /**
+     * Selects the input documents: regular files whose name ends in
+     * ".html", compared without regard to letter case.
+     */
+    private static class HtmlFileFilter implements FileFilter {
+        /**
+         * @param pathname the directory entry to test
+         * @return true when the entry is a regular file named *.html
+         */
+        public boolean accept(File pathname) {
+            // A directory that happens to be named "something.html" cannot
+            // be read as a document and would abort the whole run, so only
+            // regular files are accepted.
+            if (!pathname.isFile()) {
+                return false;
+            }
+            return pathname.getName().toLowerCase().matches("^.*\\.html$");
+        }
+
+    }
+    private static class WikiChapter {
+        private String chapterName;
+        private CharSequence contents;
+        
+        public WikiChapter(String chapterName, CharSequence contents) {
+            this.chapterName = chapterName.replaceAll("\\\\|/|:|\\(|\\)","-").replaceAll("\\s+"," ").replaceAll("&amp;","and");
+            
+            this.contents = contents;
+        }
+        
+        public String getChapterName() {
+            return chapterName;
+        }
+        
+        public CharSequence getContents() {
+            return contents;
+        }
+        
+        public String toString() {
+            return "Chapter: " + chapterName + "\nContents: " + contents;
+        }
+    }
+    
+}