view src/org/nwoca/ssdt/tools/html2wiki/Html2Wiki.java @ 2:5da2e67620f9

Upgrade to Ivy configuration and begin clean up of tests. Added FreeBSD license.
author smith@nwoca.org
date Tue, 25 Jan 2011 17:06:57 -0500
parents f8b1ea49d065
children 22ed6d93442c
line wrap: on
line source
package org.nwoca.ssdt.tools.html2wiki;
/*
 * Html2Wiki.java
 *
 * Created on May 9, 2006, 3:22 PM
 *
 */

import java.io.*;
import java.util.Collection;
import java.util.ArrayList;
import java.util.List;
import java.util.Iterator;
import org.apache.commons.io.FileSystemUtils;
import org.apache.commons.io.FileUtils;
import java.util.regex.*;
import org.apache.commons.io.FilenameUtils;

/**
 * Converter to convert HTML documents into MediaWiki text.
 *
 * Heavily customized to handle HTML produced by DEC DOCUMENT
 * SOFTWARE doctype.  Breaks file into Chapters in the manner done
 * by Document.  Needs modification to work with other HTML files.
 *
 * The conversion is an ordered list of {@link Transformer}s that are applied,
 * in place, to a single buffer holding the document.  After conversion the
 * buffer is split into chapters at every <code>&lt;chapter&gt;</code> marker
 * (presumably inserted by <code>ChapterTransformer</code> -- confirm there).
 *
 * @author SMITH
 */
public class Html2Wiki {

    /**
     * Encoding passed to the commons-io read/write calls.  <code>null</code>
     * selects the platform default encoding.  It is held in a String-typed
     * constant so the calls stay unambiguous should the library also offer
     * overloads taking a Charset.
     */
    private static final String FILE_ENCODING = null;

    /** Characters replaced by '-' when a chapter title is turned into a file name. */
    private static final Pattern FILENAME_UNSAFE = Pattern.compile("\\\\|/|:|\\(|\\)");

    /** Marker that opens a chapter in the converted buffer. */
    private static final Pattern CHAPTER_START = Pattern.compile("<chapter>");

    /** Chapter marker together with its (single line) title. */
    private static final Pattern CHAPTER_TITLE = Pattern.compile("<chapter>(.*?)</chapter>");

    /**
     * Category handed to the ChapterTransformer.  Set from the optional second
     * command line argument; note that it is static, so it is shared by every
     * converter created afterwards.
     */
    private static String category;

    /** The document being converted; transformers modify it in place. */
    private final StringBuffer buffer;

    /** Transformers in the order in which they are applied. */
    private final Collection<Transformer> transformers;

    /** Set once the transformers have run, so they are applied only once. */
    private boolean converted = false;

    /**
     * Creates a new instance of Html2Wiki.
     *
     * The transformers are only registered here; they are run lazily on the
     * first call to {@link #getWikiText()} or {@link #getWikiChapters()}.
     *
     * @param html the HTML document to convert
     */
    public Html2Wiki(String html) {
        buffer = new StringBuffer(html);
        transformers = new ArrayList<Transformer>();

        // The order below is significant: each transformer sees the output
        // of all the transformers registered before it.
        transformers.add(new PreTagTransformer());
        transformers.add(new DeleteTransformer("^\\s",true));
        transformers.add(new DeleteTransformer("<html>|</html>|<body>|</body>"));
        transformers.add(new DeleteTransformer("<!--.*-->(\\n|\\r)*",true));
        transformers.add(new DeleteTransformer("<a .*?>|</a>"));
        transformers.add(new DeleteTransformer("(?m)^\\*"));
        transformers.add(new DeleteTransformer("<blockquote>|</blockquote>"));
        transformers.add(new DeleteTransformer("<p>"));
        transformers.add(new DeleteTransformer("(?m)<br>$"));
        transformers.add(new DeleteTransformer("<font .*?>|</font>"));
        transformers.add(new CloseTagTransformer("<li>","(\n|\r)*(<li>|</ul>|</ol>|<ul>|<ol>)","\n</li>"));
        transformers.add(new BadTableDataTransformer());
        transformers.add(new BadTableRowTransformer());
        transformers.add(new ReplaceTransformer("</td>","\n</td>"));
        transformers.add(new ChapterTransformer(category));
        transformers.add(new TagTransformer("<em>(.*?)</em>",    "''"));
        transformers.add(new TagTransformer("<strong>(.*?)</strong>",    "'''"));
        transformers.add(new TagTransformer("(?s)<kbd>(.*?)</kbd>",  "<tt>", "</tt>"));
        transformers.add(new TagTransformer("<h1>(.*)</h1>",    "== ",      " =="));
        transformers.add(new TagTransformer("<h2>(.*)</h2>",    "=== ",     " ==="));
        // Must stay ahead of the generic <h3> rules so these headings are handled first.
        transformers.add(new TagTransformer("<h3>(accessing the program|sample run|sample screens?|sample reports?)</[h|H]3>","=== ",    " ==="));
        transformers.add(new TagTransformer("<h3>(.*)</H3>",    "",         ""));
        transformers.add(new TagTransformer("<h3>(.*)</h3>",    "==== ",        " ===="));
        transformers.add(new TagTransformer("<h4>(.*)</h4>",    "===== ",   " ====="));
        transformers.add(new TagTransformer("<h5>(.*)</h5>",    "====== ",  " ======"));
        transformers.add(new TagTransformer("<h6>(.*)</h6>",    "======= ", " ======="));
        transformers.add(new DeleteTransformer("(?s)<hr.*?>"));
    }

    /**
     * Converts every *.html file found directly inside the given directory.
     *
     * @param args the command line arguments: the input directory and,
     *             optionally, the category passed on to the ChapterTransformer
     * @throws IOException if a file cannot be read or written
     */
    public static void main(String[] args) throws IOException {

        if (args.length == 0) {
            System.out.println("Usage:");
            System.out.println("  Html2Wiki {inputDirectory} [Category]");
            System.out.println("      default is current directory");
            System.out.println("      Processes all *.html files. ");
            System.out.println("      Each 'chapter' written to *.wiki");
            return;
        }

        File inputs = new File(args[0]);

        if (args.length > 1) {
            category = args[1];
        }

        File[] inputFiles = inputs.listFiles(new HtmlFileFilter());
        if (inputFiles == null) {
            // listFiles() returns null (not an empty array) when the path is
            // not a directory or cannot be read.
            System.err.println("Not a readable directory: " + inputs.getAbsolutePath());
            return;
        }

        for (File inputFile : inputFiles) {
            process(inputFile);
        }
    }

    /**
     * Converts one HTML file.  Each chapter is written to its own
     * <code>.wiki</code> file next to the input, and a numbered list linking
     * to all chapters is written to <code>&lt;input without extension&gt;.wikiIndex</code>.
     *
     * @param input the HTML file to convert
     * @throws IOException if the input cannot be read or an output cannot be written
     */
    protected static void process(File input) throws IOException {

        System.out.println(input.getAbsoluteFile());

        Html2Wiki converter = new Html2Wiki(FileUtils.readFileToString(input, FILE_ENCODING));
        WikiChapter[] chapters = converter.getWikiChapters();

        System.out.format("Writing %d wiki files...\n",chapters.length);

        StringBuilder wikiIndex = new StringBuilder("Contents:\n\n");

        for (int i = 0; i < chapters.length; i++) {
            String title = chapters[i].getChapterName();
            if (title == null) {
                // Chapter marker without a usable title: derive a unique name
                // from the input file so the chapter is still written out.
                title = FilenameUtils.removeExtension(input.getName()) + "-" + (i + 1);
            }

            wikiIndex.append("# [[").append(title).append("]]\n");

            File chapterFile = new File(input.getParent(), generateFilename(title) + ".wiki");
            FileUtils.writeStringToFile(chapterFile, chapters[i].getContents().toString(), FILE_ENCODING);
        }

        System.out.println("Writing wikiIndex...");

        File indexFile = new File(FilenameUtils.removeExtension(input.getPath()) + ".wikiIndex");
        FileUtils.writeStringToFile(indexFile, wikiIndex.toString(), FILE_ENCODING);
    }

    /**
     * Makes a chapter title usable as a file name by replacing each backslash,
     * slash, colon and parenthesis with a '-'.
     *
     * @param input the title to sanitise; must not be null
     * @return the title with the unsafe characters replaced
     */
    public static String generateFilename(String input) {
        return FILENAME_UNSAFE.matcher(input).replaceAll("-");
    }

    /**
     * Returns the whole converted document, running the conversion first if
     * it has not happened yet.
     *
     * @return the converted text
     */
    public String getWikiText() {
        convert();
        return buffer.toString();
    }

    /**
     * Splits the converted document into chapters.
     *
     * A chapter runs from one <code>&lt;chapter&gt;</code> marker up to the
     * next marker, or to the end of the document for the last one.  Text in
     * front of the first marker belongs to no chapter and is not returned.
     * When the marker is directly followed by <code>title&lt;/chapter&gt;</code>
     * on the same line, the title becomes the chapter name and is removed from
     * the contents; otherwise the chapter name is null and the contents start
     * at the marker itself.
     *
     * @return the chapters in document order; empty if there is no marker
     */
    public WikiChapter[] getWikiChapters() {

        convert();

        List<WikiChapter> chapters = new ArrayList<WikiChapter>();

        Matcher begin = CHAPTER_START.matcher(buffer);
        Matcher end = CHAPTER_START.matcher(buffer);
        Matcher title = CHAPTER_TITLE.matcher(buffer);

        while (begin.find()) {

            // Use find()'s own result to decide whether another chapter follows.
            int chapterEnd = end.find(begin.end()) ? end.start() : buffer.length();

            // Confine the title search to this chapter and anchor it at the
            // marker, so a later chapter's title can never be picked up.
            title.region(begin.start(), chapterEnd);

            String chapterName = null;
            int contentStart = begin.start();
            if (title.lookingAt()) {
                chapterName = title.group(1);
                contentStart = title.end();
            }

            chapters.add(new WikiChapter(chapterName, buffer.subSequence(contentStart, chapterEnd)));
        }

        return chapters.toArray(new WikiChapter[chapters.size()]);
    }

    /**
     * Runs all transformers over the buffer, in registration order.  Does
     * nothing on the second and later calls.
     */
    private void convert() {

        if (converted) {
            return;
        }

        for (Transformer t : transformers) {
            System.out.println(".Applying: " + t);
            t.apply(buffer);
        }
        converted = true;
    }

    /** Accepts regular files whose name ends in ".html", ignoring case. */
    private static class HtmlFileFilter implements FileFilter {
        public boolean accept(File pathname) {
            // isFile() keeps out directories that merely happen to be named *.html.
            return pathname.isFile() && pathname.getName().toLowerCase().endsWith(".html");
        }
    }

    /** One chapter of the converted document: a cleaned-up name plus its wiki text. */
    protected static class WikiChapter {
        private final String chapterName;
        private final CharSequence contents;

        /**
         * @param chapterName raw chapter title, or null if the chapter has none.
         *                    File-name-unsafe characters are replaced by '-',
         *                    runs of whitespace are collapsed to one space and
         *                    "&amp;amp;" is replaced by "and".
         * @param contents    the wiki text of the chapter
         */
        public WikiChapter(String chapterName, CharSequence contents) {
            this.chapterName = (chapterName == null)
                    ? null
                    : generateFilename(chapterName).replaceAll("\\s+"," ").replaceAll("&amp;","and");
            this.contents = contents;
        }

        /** @return the cleaned-up chapter name, or null if the chapter has no title */
        public String getChapterName() {
            return chapterName;
        }

        /** @return the wiki text of the chapter */
        public CharSequence getContents() {
            return contents;
        }

        @Override
        public String toString() {
            return "Chapter: " + chapterName + " Content length: " + contents.length();
        }
    }

}