/*
 * view src/org/nwoca/ssdt/tools/html2wiki/Html2Wiki.java @ 9:ccb40d1cb213
 *
 * [no commit message]
 * author ferrall@nwoca.org
 * date Fri, 28 Jan 2011 09:07:05 -0500
 * parents e8ea26ab2cd7
 * children 2fb5084b1564
 * line wrap: on
 * line source
 */
package org.nwoca.ssdt.tools.html2wiki;
/*
 * Html2Wiki.java
 *
 * Created on May 9, 2006, 3:22 PM
 *
 */

import java.io.*;
import java.util.Collection;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.io.FileUtils;
import java.util.regex.*;

/**
 * Converter to convert HTML documents into MediaWiki test.
 *
 * Heavily customized to handle HTML produced by DEC DOCUMENT
 * SOFTARE doctype.  Breaks file into Chapters in the manner done
 * by Document.  Needs modification to work with other HTML files.
 *
 * @author SMITH
 */
public class Html2Wiki {

    private StringBuffer buffer;
    private Collection<Transformer> transformers;
    private boolean converted = false;
    private static String category;

    /** Creates a new instance of Html2Wiki. */
    public Html2Wiki(String html) {
        buffer = new StringBuffer(html);
        transformers =  new ArrayList<Transformer>();
        transformers.add(new DeleteTransformer("<html>|</html>|<body>|</body>"));
        transformers.add(new DeleteTransformer("<!--.*-->(\\n|\\r)*",true));
        transformers.add(new DeleteTransformer("<a .*?>|</a>"));
        transformers.add(new DeleteTransformer("(?m)^\\*"));
        transformers.add(new DeleteTransformer("(?m)<br>$"));
        transformers.add(new DeleteTransformer("<font .*?>|</font>"));
        transformers.add(new CloseTagTransformer("<li>","(\n|\r)*(<li>|</ul>|</ol>|<ul>|<ol>)","</li>"));
        transformers.add(new BadTableDataTransformer());
        transformers.add(new BadTableRowTransformer());
        transformers.add(new ReflowTransformer());
        transformers.add(new DeleteTransformer("<p>"));
        transformers.add(new ReplaceTransformer("\\{","\\{"));  // Escape braces
        transformers.add(new ReplaceTransformer("\\}","\\}"));

        transformers.add(new ReplaceTransformer("\\[","\\["));  // Escape brackets
        transformers.add(new ReplaceTransformer("\\]","\\]"));
        transformers.add(new PreTagTransformer());              // Unescape brackets inside <pre>
//
        transformers.add(new ReplaceTransformer("<br>","\\\\"));

        //replace table tag preserving border setting.
        transformers.add(new TagTransformer("<table\\sborder=(\\d).*?>", true, "{table:border=","}"));

        transformers.add(new ReplaceTransformer("<table.*?>|</table>","{table}"));
        transformers.add(new ReplaceTransformer("<tr>|</tr>","{tr}"));
        transformers.add(new ReplaceTransformer("<td.*?>|</td>","{td}"));
        transformers.add(new ReplaceTransformer("<th.*?>|</th>","{th}"));
        transformers.add(new ReplaceTransformer("<ol.*?>|</ol>","{ol}"));
        transformers.add(new ReplaceTransformer("<ul.*?>|</ul>","{ul}"));
        transformers.add(new ReplaceTransformer("<li>","{li}"));
        transformers.add(new ReplaceTransformer("</li>","{li}\n"));

        transformers.add(new ChapterTransformer(category));
        transformers.add(new TagTransformer("<pre>(.*?)</pre>", true, "{code}","{code}"));
        transformers.add(new TagTransformer("<center>(.*?)</center>", true, "{center}","{center}"));
        transformers.add(new TagTransformer("<em>(.*?)</em>",    "*","*"));
        transformers.add(new TagTransformer("<strong>(.*?)</strong>",    "*","*"));
        transformers.add(new TagTransformer("<u>(.*?)</u>" , "+","+"));
        transformers.add(new TagTransformer("(?s)<kbd>(.*?)</kbd>",  "{{", "}}"));
        transformers.add(new TagTransformer("<h1>(.*)</h1>",    "h1. ",      ""));
        transformers.add(new TagTransformer("<h2>(.*)</h2>",    "h2. ",     ""));
        transformers.add(new TagTransformer("<h3>(accessing the program|sample run|sample screens?|sample reports?)</[h|H]3>","h3.",    ""));
        transformers.add(new TagTransformer("<h3>(.*)</H3>",    "h3. ",         ""));
        transformers.add(new TagTransformer("<h3>(.*)</h3>",    "h3. ",        ""));
        transformers.add(new TagTransformer("<h4>(.*)</h4>",    "h4. ",   ""));
        transformers.add(new TagTransformer("<h5>(.*)</h5>",    "h5. ",  ""));
        transformers.add(new TagTransformer("<h6>(.*)</h6>",    "h6. ", ""));

        //Replace Notes with Info tags.
        transformers.add(new ReplaceTransformer("\\{center}\\n\\{table:border=\\d}\\n\\{tr\\}\\n\\s{2}\\{td\\}\\{center\\}\\*Note\\*\\{center\\}","{info}"));
        transformers.add(new ReplaceTransformer("\\{td\\}\\n\\s{2}\\{tr\\}\\n\\{table\\}\\n\\{center\\}","{info}"));

        //Remove unnecessary table surrounding code blocks.
        transformers.add(new ReplaceTransformer("\\{table:.*\\}\\n\\s{2}\\{tr\\}\\n\\s{4}\\{td\\}\\n\\s{6}\\n\\{code\\}","{code}"));
        transformers.add(new ReplaceTransformer("\\{code\\}\\n\\{td\\}\\{tr\\}\\{table\\}","{code}"));

        //Change borderStyle of code window for "screenshots" to none.
        transformers.add(new TagTransformer("\\{code\\}([\\s\\n]*?_______________)", true, "{code:borderStyle=none}", ""));



        transformers.add(new TagTransformer("<blockquote>(.*?)</blockquote>", true,   "{quote}", "{quote}"));
        transformers.add(new DeleteTransformer("(?s)<hr.*?>"));
        transformers.add(new ReflowTransformer("(\\{info\\})([^\\{]*)(\\{info\\})"));
        transformers.add(new TagTransformer("<sup>(.*?)</sup>", true, "^\\[","\\]^ "));
        transformers.add(new ReplaceTransformer("&lt;","<"));
        transformers.add(new ReplaceTransformer("&gt;",">"));
        transformers.add(new ReplaceTransformer("&quot;","\""));
        transformers.add(new ReplaceTransformer(":\\)",": )"));  // No smilies...

    }

    /**
     * @param args the command line arguments
     */
    public static void main(String[] args) throws IOException {

        if (args.length == 0) {
            System.out.println("Usage:");
            System.out.println("  Html2Wiki {inputDirectory} [Category]");
            System.out.println("      default is current directory");
            System.out.println("      Processes all *.html files. ");
            System.out.println("      Each 'chapter' written to *.wiki");
            return;
        }

        File inputs = new File(args[0]);

        if (args.length > 1) {
            category = args[1];
        }

        File[] inputFiles = inputs.listFiles(new HtmlFileFilter());
        for (int i = 0; i < inputFiles.length; i++) {

            process(inputFiles[i]);

        }

    }

    protected static void process(File input) throws IOException {

        System.out.println(input.getAbsoluteFile());

        Html2Wiki converter = new Html2Wiki(FileUtils.readFileToString(input, null));

        WikiChapter[] chapters = converter.getWikiChapters();

        System.out.format("Writing %d wiki files...\n", chapters.length);

 
        for (int i = 0; i < chapters.length; i++) {

            FileUtils.writeStringToFile(new File(input.getParent(),
                    generateFilename(chapters[i].getChapterName()) + ".wiki"),
                    chapters[i].getContents().toString(),
                    null);

        }

    }

    public static String generateFilename(String input) {
        return input.replaceAll("\\\\|/|:|\\(|\\)", "-").replace("<br>", "");

    }

    public String getWikiText() {
        convert();
        return buffer.toString();
    }

    public WikiChapter[] getWikiChapters() {

        convert();

        List<WikiChapter> chapters = new ArrayList<WikiChapter>();

        Pattern chapterPat = Pattern.compile("<chapter>");
        Matcher begin = chapterPat.matcher(buffer);
        Matcher end = chapterPat.matcher(buffer);

        while (begin.find()) {


            end.find(begin.end());

            Pattern chapterNamePat = Pattern.compile("<chapter>(.*?)</chapter>");

            Matcher chapterNameMatcher = chapterNamePat.matcher(buffer);

            String chapterName = chapterNameMatcher.find(begin.start()) ? chapterNameMatcher.group(1) : null;

            CharSequence contents = buffer.subSequence(chapterName == null ? begin.start() : chapterNameMatcher.end(), end.hitEnd() ? buffer.length() : end.start());

            chapters.add(new WikiChapter(chapterName, contents));

        }
        return (WikiChapter[]) chapters.toArray(new WikiChapter[]{});
    }

    private void convert() {

        if (!converted) {
            for (Transformer t : transformers) {

                System.out.println(".Applying: " + t);
                t.apply(buffer);

            }
        }
        converted = true;
    }

    private static class HtmlFileFilter implements FileFilter {

        public boolean accept(File pathname) {
            return pathname.getName().toLowerCase().matches("^.*\\.html$");
        }
    }

    protected static class WikiChapter {

        private String chapterName;
        private CharSequence contents;

        public WikiChapter(String chapterName, CharSequence contents) {
            this.chapterName = chapterName.replaceAll("\\\\|/|:|\\(|\\)", "-").replaceAll("\\s+", " ").replaceAll("&amp;", "and");

            this.contents = contents;
        }

        public String getChapterName() {
            return chapterName;
        }

        public CharSequence getContents() {
            return contents;
        }

        public String toString() {
            return "Chapter: " + chapterName + " Content length: " + contents.length();
        }
    }
}