view src/org/nwoca/ssdt/tools/html2wiki/Html2Wiki.java @ 5:d34f4d408ef9

[no commit message]
author ferrall@nwoca.org
date Thu, 27 Jan 2011 14:32:34 -0500
parents 22ed6d93442c
children 99f293bd507f
line wrap: on
line source
package org.nwoca.ssdt.tools.html2wiki;
/*
 * Html2Wiki.java
 *
 * Created on May 9, 2006, 3:22 PM
 *
 */

import java.io.*;
import java.util.Collection;
import java.util.ArrayList;
import java.util.List;
import java.util.Iterator;
import org.apache.commons.io.FileSystemUtils;
import org.apache.commons.io.FileUtils;
import java.util.regex.*;
import org.apache.commons.io.FilenameUtils;

/**
 * Converter to convert HTML documents into MediaWiki text.
 *
 * Heavily customized to handle HTML produced by DEC DOCUMENT
 * SOFTWARE doctype.  Breaks file into Chapters in the manner done
 * by Document.  Needs modification to work with other HTML files.
 *
 * <p>The conversion is an ordered list of {@code Transformer}s that are applied
 * in place to one shared buffer.  The order is significant: several later
 * patterns (for example the <code>{note}</code> replacements) match text that
 * earlier transformers produced.
 *
 * @author SMITH
 */
public class Html2Wiki {

    /** Characters that are replaced by '-' in chapter names and generated file names. */
    private static final Pattern UNSAFE_NAME_CHARS = Pattern.compile("\\\\|/|:|\\(|\\)");

    /**
     * Marker that opens a chapter in the converted text.
     * NOTE(review): presumably emitted by ChapterTransformer - confirm.
     */
    private static final Pattern CHAPTER_START = Pattern.compile("<chapter>");

    /** A chapter marker together with its title (group 1). */
    private static final Pattern CHAPTER_TITLE = Pattern.compile("<chapter>(.*?)</chapter>");

    /**
     * Category handed to the ChapterTransformer.  It is read once, in the
     * constructor, so it has to be set (see {@link #main(String[])}) before an
     * instance is created.  May be null when no category was given.
     */
    private static String category;

    /** Text being converted; the transformers modify it in place. */
    private final StringBuffer buffer;

    /** Transformers in the order in which they are applied. */
    private final Collection<Transformer> transformers;

    /** True once the transformers have been applied, so they run only once. */
    private boolean converted = false;

    /**
     * Creates a new instance of Html2Wiki.
     *
     * @param html the HTML text to convert
     */
    public Html2Wiki(String html) {
        buffer = new StringBuffer(html);
        transformers =  new ArrayList<Transformer>();

        // --- Strip markup that has no wiki equivalent ---------------------
//        transformers.add(new PreTagTransformer());
//        transformers.add(new DeleteTransformer("^\\s",true));
        transformers.add(new DeleteTransformer("<html>|</html>|<body>|</body>"));
        // NOTE(review): ".*" is greedy; if the boolean flag lets '.' match line
        // terminators this removes everything between the first "<!--" and the
        // last "-->" - confirm against DeleteTransformer.
        transformers.add(new DeleteTransformer("<!--.*-->(\\n|\\r)*",true));
        transformers.add(new DeleteTransformer("<a .*?>|</a>"));
        transformers.add(new DeleteTransformer("(?m)^\\*"));
//        transformers.add(new DeleteTransformer("<blockquote>|</blockquote>"));
        transformers.add(new DeleteTransformer("<p>"));
        transformers.add(new DeleteTransformer("(?m)<br>$"));
        transformers.add(new DeleteTransformer("<font .*?>|</font>"));

        // --- Repair unbalanced list/table markup --------------------------
        transformers.add(new CloseTagTransformer("<li>","(\n|\r)*(<li>|</ul>|</ol>|<ul>|<ol>)","</li>"));
        transformers.add(new BadTableDataTransformer());
        transformers.add(new BadTableRowTransformer());
//        transformers.add(new ReplaceTransformer("</td>","\n</td>"));

        // --- Braces and structural tags -----------------------------------
        // NOTE(review): if ReplaceTransformer uses java.util.regex replacement
        // syntax, "\\{" as a replacement yields a plain "{" and these two lines
        // are no-ops; confirm whether escaping to "\{" was intended.
          transformers.add(new ReplaceTransformer("\\{","\\{"));
          transformers.add(new ReplaceTransformer("\\}","\\}"));
//          transformers.add(new ReplaceTransformer("\\[","\\["));
//          transformers.add(new ReplaceTransformer("\\]","\\]"));
        transformers.add(new ReplaceTransformer("<br>","\\\\"));
        transformers.add(new ReplaceTransformer("<table.*?>|</table>","{table}"));
        transformers.add(new ReplaceTransformer("<tr>|</tr>","{tr}"));
        transformers.add(new ReplaceTransformer("<td.*?>|</td>","{td}"));
        transformers.add(new ReplaceTransformer("<th.*?>|</th>","{th}"));
        transformers.add(new ReplaceTransformer("<ol.*?>|</ol>","{ol}"));
        transformers.add(new ReplaceTransformer("<ul.*?>|</ul>","{ul}"));
        transformers.add(new ReplaceTransformer("<li>","{li}"));
        transformers.add(new ReplaceTransformer("</li>","{li}\n"));

        // --- Chapters, inline formatting and headings ---------------------
        transformers.add(new ChapterTransformer(category));
        transformers.add(new TagTransformer("<pre>(.*?)</pre>", true, "{code}","{code}"));
        transformers.add(new TagTransformer("<center>(.*?)</center>", true, "{center}","{center}"));
        transformers.add(new TagTransformer("<em>(.*?)</em>",    "*","*"));
        transformers.add(new TagTransformer("<strong>(.*?)</strong>",    "*","*"));
        transformers.add(new TagTransformer("(?s)<kbd>(.*?)</kbd>",  "{{", "}}"));
        transformers.add(new TagTransformer("<h1>(.*)</h1>",    "h1. ",      ""));
        transformers.add(new TagTransformer("<h2>(.*)</h2>",    "h2. ",     ""));
        // Must stay ahead of the generic <h3> rules below, which would otherwise
        // consume these headings first.
        // NOTE(review): "[h|H]" is a character class that also admits '|'.
        transformers.add(new TagTransformer("<h3>(accessing the program|sample run|sample screens?|sample reports?)</[h|H]3>","h3.",    ""));
        transformers.add(new TagTransformer("<h3>(.*)</H3>",    "h3. ",         ""));
        transformers.add(new TagTransformer("<h3>(.*)</h3>",    "h3. ",        ""));
        transformers.add(new TagTransformer("<h4>(.*)</h4>",    "h4. ",   ""));
        transformers.add(new TagTransformer("<h5>(.*)</h5>",    "h5. ",  ""));
        transformers.add(new TagTransformer("<h6>(.*)</h6>",    "h6. ", ""));

        // --- Collapse the centered one-cell "Note" table into {note} ------
        // These match the {center}/{table}/{tr}/{td} tokens produced above.
        transformers.add(new ReplaceTransformer("\\{center}\\n\\{table}\\n\\{tr\\}\\n\\s{2}\\{td\\}\\{center\\}\\*Note\\*\\{center\\}","{note}"));
        transformers.add(new ReplaceTransformer("\\{td\\}\\n\\s{2}\\{tr\\}\\n\\{table\\}\\n\\{center\\}","{note}"));

//        transformers.add(new TagTransformer("\\{center}\\n\\{table}\\n\\{tr\\}\\n\\s{2}\\{td\\}\\{center\\}\\*Note\\*\\{center\\}(.*?)\\s\\{td\\}\\n\\s{2}\\{tr\\}\\{table\\}",    "{note}",  "{note}"));
//        transformers.add(new TagTransformer("(\\S)\\s\\n",    "", " "));
        transformers.add(new TagTransformer("<blockquote>(.*)</blockquote>",    "{quote}", "{quote}"));
        transformers.add(new DeleteTransformer("(?s)<hr.*?>"));

    }

    /**
     * Converts every *.html file of a directory.
     *
     * @param args the command line arguments: the input directory and,
     *             optionally, a category
     * @throws IOException if an input file cannot be read or an output file
     *                     cannot be written
     */
    public static void main(String[] args) throws IOException {

        if (args.length == 0) {
            System.out.println("Usage:");
            System.out.println("  Html2Wiki {inputDirectory} [Category]");
            System.out.println("      default is current directory");
            System.out.println("      Processes all *.html files. ");
            System.out.println("      Each 'chapter' written to *.wiki");
            return;
        }

        File inputs = new File(args[0]);

        // Must be assigned before process() constructs a converter.
        if (args.length > 1) {
            category = args[1];
        }

        File[] inputFiles = inputs.listFiles(new HtmlFileFilter());
        if (inputFiles == null) {
            // listFiles() returns null when the path is not a directory or
            // cannot be read; report it instead of failing with a NullPointerException.
            System.err.println("Not a readable directory: " + inputs.getAbsolutePath());
            return;
        }

        for (File inputFile : inputFiles) {
            process(inputFile);
        }

    }

    /**
     * Converts one HTML file.  Every chapter is written to
     * "&lt;chapter name&gt;.wiki" next to the input file, and a list of all
     * chapters is written to "&lt;input without extension&gt;.wikiIndex".
     *
     * @param input the HTML file to convert
     * @throws IOException if reading the input or writing an output fails
     */
    protected static void process(File input) throws IOException {

        System.out.println(input.getAbsoluteFile());

        // A null encoding means "platform default".  The (String) cast keeps the
        // call unambiguous with commons-io releases that add Charset overloads.
        Html2Wiki converter = new Html2Wiki(FileUtils.readFileToString(input, (String) null));

        WikiChapter[] chapters = converter.getWikiChapters();

        System.out.format("Writing %d wiki files...\n",chapters.length);

        StringBuilder wikiIndex = new StringBuilder("Contents:\n\n");

        for (int i = 0; i < chapters.length; i++) {

            String chapterName = chapters[i].getChapterName();
            if (chapterName == null) {
                // Chapter marker without a usable title: derive a name from the
                // input file and the chapter position so it can still be written.
                chapterName = FilenameUtils.getBaseName(input.getName()) + " " + (i + 1);
            }

            wikiIndex.append("# [[").append(chapterName).append("]]\n");

            File chapterFile = new File(input.getParent(), generateFilename(chapterName) + ".wiki");
            FileUtils.writeStringToFile(chapterFile, chapters[i].getContents().toString(), (String) null);

        }
        System.out.println("Writing wikiIndex...");

        File indexFile = new File(FilenameUtils.removeExtension(input.getPath()) + ".wikiIndex");
        FileUtils.writeStringToFile(indexFile, wikiIndex.toString(), (String) null);
    }

    /**
     * Turns a chapter name into a file name: backslash, slash, colon and
     * parentheses become '-', and any literal "&lt;br&gt;" is removed.
     *
     * @param input the chapter name
     * @return the name to use for the chapter's file (without extension)
     */
    public static String generateFilename(String input) {
        return UNSAFE_NAME_CHARS.matcher(input).replaceAll("-").replace("<br>", "");
    }

    /**
     * Runs the conversion if it has not run yet.
     *
     * @return the complete converted text
     */
    public String getWikiText() {
        convert();
        return buffer.toString();
    }

    /**
     * Runs the conversion if it has not run yet and splits the result at each
     * "&lt;chapter&gt;" marker.  Text in front of the first marker is not part
     * of any chapter.
     *
     * @return the chapters in document order.  A chapter whose marker is not
     *         directly followed by a title terminated with "&lt;/chapter&gt;"
     *         (inside that chapter) has a null name, and its contents start at
     *         the marker itself.
     */
    public WikiChapter[] getWikiChapters() {

        convert();

        List<WikiChapter> chapters = new ArrayList<WikiChapter>();

        Matcher begin = CHAPTER_START.matcher(buffer);
        Matcher next = CHAPTER_START.matcher(buffer);
        Matcher title = CHAPTER_TITLE.matcher(buffer);

        while (begin.find()) {

            // The chapter runs up to the next marker, or to the end of the text.
            int chapterEnd = next.find(begin.end()) ? next.start() : buffer.length();

            // Only accept a title that starts at this marker and ends inside this
            // chapter; a plain find() could pick up the title of a later chapter
            // and produce a start offset beyond chapterEnd.
            title.region(begin.start(), chapterEnd);

            String chapterName = null;
            int contentsStart = begin.start();
            if (title.lookingAt()) {
                chapterName = title.group(1);
                contentsStart = title.end();
            }

            chapters.add(new WikiChapter(chapterName, buffer.subSequence(contentsStart, chapterEnd)));

        }
        return chapters.toArray(new WikiChapter[chapters.size()]);
    }

    /** Applies all transformers to the buffer, in order, the first time it is called. */
    private void convert() {

        if (converted) {
            return;
        }
        for (Transformer t : transformers) {

            System.out.println(".Applying: " + t);
            t.apply(buffer);

        }
        converted = true;
    }

    /** Accepts regular files whose name ends in ".html", ignoring case. */
    private static class HtmlFileFilter implements FileFilter {
        public boolean accept(File pathname) {
            // Directories are rejected: process() can only read regular files.
            return pathname.isFile() && pathname.getName().toLowerCase().endsWith(".html");
        }

    }

    /** One chapter of the converted document: a cleaned-up name plus its wiki text. */
    protected static class WikiChapter {
        private final String chapterName;
        private final CharSequence contents;

        /**
         * @param chapterName raw chapter title, or null when the chapter has none;
         *                    backslash, slash, colon and parentheses are replaced
         *                    by '-', whitespace runs by a single space and
         *                    "&amp;amp;" by "and"
         * @param contents    the wiki text of the chapter
         */
        public WikiChapter(String chapterName, CharSequence contents) {
            this.chapterName = chapterName == null ? null : normalizeName(chapterName);

            this.contents = contents;
        }

        /** Applies the name clean-up described on the constructor. */
        private static String normalizeName(String rawName) {
            String name = UNSAFE_NAME_CHARS.matcher(rawName).replaceAll("-");
            name = name.replaceAll("\\s+"," ");
            return name.replaceAll("&amp;","and");
        }

        /** @return the cleaned-up chapter name, or null for an untitled chapter */
        public String getChapterName() {
            return chapterName;
        }

        /** @return the wiki text of the chapter */
        public CharSequence getContents() {
            return contents;
        }

        @Override
        public String toString() {
            return "Chapter: " + chapterName + " Content length: " + contents.length();
        }
    }

}