changeset 6:99f293bd507f

Add "reflow" transformer to reflow paragraphs, list items, etc.
author smith@nwoca.org
date Thu, 27 Jan 2011 16:37:27 -0500 (2011-01-27)
parents d34f4d408ef9
children a634b4d554d4
files src/org/nwoca/ssdt/tools/html2wiki/Html2Wiki.java src/org/nwoca/ssdt/tools/html2wiki/ReflowTransformer.java
diffstat 2 files changed, 118 insertions(+), 69 deletions(-) [+]
line wrap: on
line diff
--- a/src/org/nwoca/ssdt/tools/html2wiki/Html2Wiki.java	Thu Jan 27 14:32:34 2011 -0500
+++ b/src/org/nwoca/ssdt/tools/html2wiki/Html2Wiki.java	Thu Jan 27 16:37:27 2011 -0500
@@ -10,8 +10,6 @@
 import java.util.Collection;
 import java.util.ArrayList;
 import java.util.List;
-import java.util.Iterator;
-import org.apache.commons.io.FileSystemUtils;
 import org.apache.commons.io.FileUtils;
 import java.util.regex.*;
 import org.apache.commons.io.FilenameUtils;
@@ -26,12 +24,12 @@
  * @author SMITH
  */
 public class Html2Wiki {
-    
+
     private StringBuffer buffer;
     private Collection<Transformer> transformers;
     private boolean converted = false;
     private static String category;
-    
+
     /** Creates a new instance of Html2Wiki. */
     public Html2Wiki(String html) {
         buffer = new StringBuffer(html);
@@ -43,12 +41,13 @@
         transformers.add(new DeleteTransformer("<a .*?>|</a>"));
         transformers.add(new DeleteTransformer("(?m)^\\*"));
 //        transformers.add(new DeleteTransformer("<blockquote>|</blockquote>"));
-        transformers.add(new DeleteTransformer("<p>"));
         transformers.add(new DeleteTransformer("(?m)<br>$"));
         transformers.add(new DeleteTransformer("<font .*?>|</font>"));
         transformers.add(new CloseTagTransformer("<li>","(\n|\r)*(<li>|</ul>|</ol>|<ul>|<ol>)","</li>"));
         transformers.add(new BadTableDataTransformer());
         transformers.add(new BadTableRowTransformer());
+        transformers.add(new ReflowTransformer());
+        transformers.add(new DeleteTransformer("<p>"));
 //        transformers.add(new ReplaceTransformer("</td>","\n</td>"));
           transformers.add(new ReplaceTransformer("\\{","\\{"));
           transformers.add(new ReplaceTransformer("\\}","\\}"));
@@ -63,7 +62,7 @@
         transformers.add(new ReplaceTransformer("<ul.*?>|</ul>","{ul}"));
         transformers.add(new ReplaceTransformer("<li>","{li}"));
         transformers.add(new ReplaceTransformer("</li>","{li}\n"));
-        
+
         transformers.add(new ChapterTransformer(category));
         transformers.add(new TagTransformer("<pre>(.*?)</pre>", true, "{code}","{code}"));
         transformers.add(new TagTransformer("<center>(.*?)</center>", true, "{center}","{center}"));
@@ -85,14 +84,15 @@
 //        transformers.add(new TagTransformer("(\\S)\\s\\n",    "", " "));
         transformers.add(new TagTransformer("<blockquote>(.*)</blockquote>",    "{quote}", "{quote}"));
         transformers.add(new DeleteTransformer("(?s)<hr.*?>"));
-     
+        transformers.add(new ReflowTransformer("(\\{note\\})([^\\{]*)(\\{note\\})"));
+
     }
-    
+
     /**
      * @param args the command line arguments
      */
     public static void main(String[] args) throws IOException {
-        
+
         if (args.length == 0) {
             System.out.println("Usage:");
             System.out.println("  Html2Wiki {inputDirectory} [Category]");
@@ -101,128 +101,124 @@
             System.out.println("      Each 'chapter' written to *.wiki");
             return;
         }
-        
+
         File inputs = new File(args[0]);
-        
+
         if (args.length > 1) {
             category = args[1];
         }
-        
+
         File[] inputFiles = inputs.listFiles(new HtmlFileFilter());
         for (int i = 0; i < inputFiles.length; i++) {
-            
+
             process(inputFiles[i]);
-            
+
         }
-        
+
     }
-    
+
     protected static void process(File input) throws IOException {
-        
+
         System.out.println(input.getAbsoluteFile());
-        
-        Html2Wiki converter = new Html2Wiki(FileUtils.readFileToString(input,null));
-                
+
+        Html2Wiki converter = new Html2Wiki(FileUtils.readFileToString(input, null));
+
         WikiChapter[] chapters = converter.getWikiChapters();
-        
-        System.out.format("Writing %d wiki files...\n",chapters.length);
+
+        System.out.format("Writing %d wiki files...\n", chapters.length);
 
-        StringBuffer wikiIndex = new StringBuffer();
-        wikiIndex.append("Contents:\n\n");
-        
+ 
         for (int i = 0; i < chapters.length; i++) {
-            
-            wikiIndex.append("# [[" + chapters[i].getChapterName() + "]]\n");
+
             FileUtils.writeStringToFile(new File(input.getParent(),
-                                        generateFilename(chapters[i].getChapterName())+".wiki"),
-                            chapters[i].getContents().toString(),
-                            null);
-            
+                    generateFilename(chapters[i].getChapterName()) + ".wiki"),
+                    chapters[i].getContents().toString(),
+                    null);
+
         }
-        System.out.println("Writing wikiIndex...");
-        
-        FileUtils.writeStringToFile(new File(FilenameUtils.removeExtension(input.getPath())+".wikiIndex"),wikiIndex.toString(),null);
+
     }
-    
+
     public static String generateFilename(String input) {
-        return input.replaceAll("\\\\|/|:|\\(|\\)","-").replace("<br>", "");
-        
+        return input.replaceAll("\\\\|/|:|\\(|\\)", "-").replace("<br>", "");
+
     }
+
     public String getWikiText() {
         convert();
         return buffer.toString();
     }
-    
+
     public WikiChapter[] getWikiChapters() {
-        
+
         convert();
-        
+
         List<WikiChapter> chapters = new ArrayList<WikiChapter>();
-        
+
         Pattern chapterPat = Pattern.compile("<chapter>");
         Matcher begin = chapterPat.matcher(buffer);
         Matcher end = chapterPat.matcher(buffer);
-        
-        while(begin.find()) {
-            
-            
+
+        while (begin.find()) {
+
+
             end.find(begin.end());
-            
+
             Pattern chapterNamePat = Pattern.compile("<chapter>(.*?)</chapter>");
-            
+
             Matcher chapterNameMatcher = chapterNamePat.matcher(buffer);
-            
+
             String chapterName = chapterNameMatcher.find(begin.start()) ? chapterNameMatcher.group(1) : null;
-            
-            CharSequence contents = buffer.subSequence(chapterName == null ? begin.start() : chapterNameMatcher.end()
-            ,end.hitEnd() ? buffer.length() : end.start());
-            
-            chapters.add(new WikiChapter(chapterName,contents));
-            
+
+            CharSequence contents = buffer.subSequence(chapterName == null ? begin.start() : chapterNameMatcher.end(), end.hitEnd() ? buffer.length() : end.start());
+
+            chapters.add(new WikiChapter(chapterName, contents));
+
         }
-        return (WikiChapter[])chapters.toArray(new WikiChapter[]{});
+        return (WikiChapter[]) chapters.toArray(new WikiChapter[]{});
     }
-    
+
     private void convert() {
-        
-        if(!converted) {
+
+        if (!converted) {
             for (Transformer t : transformers) {
-                
+
                 System.out.println(".Applying: " + t);
                 t.apply(buffer);
-                
+
             }
         }
         converted = true;
     }
-    
+
     private static class HtmlFileFilter implements FileFilter {
+
         public boolean accept(File pathname) {
             return pathname.getName().toLowerCase().matches("^.*\\.html$");
         }
-        
     }
+
     protected static class WikiChapter {
+
         private String chapterName;
         private CharSequence contents;
-        
+
         public WikiChapter(String chapterName, CharSequence contents) {
-            this.chapterName = chapterName.replaceAll("\\\\|/|:|\\(|\\)","-").replaceAll("\\s+"," ").replaceAll("&amp;","and");
-            
+            this.chapterName = chapterName.replaceAll("\\\\|/|:|\\(|\\)", "-").replaceAll("\\s+", " ").replaceAll("&amp;", "and");
+
             this.contents = contents;
         }
-        
+
         public String getChapterName() {
             return chapterName;
         }
-        
+
         public CharSequence getContents() {
             return contents;
         }
-        
+
         public String toString() {
             return "Chapter: " + chapterName + " Content length: " + contents.length();
         }
     }
-    
 }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/org/nwoca/ssdt/tools/html2wiki/ReflowTransformer.java	Thu Jan 27 16:37:27 2011 -0500
@@ -0,0 +1,53 @@
+package org.nwoca.ssdt.tools.html2wiki;
+
+import java.util.regex.*;
+
+class ReflowTransformer implements Transformer {
+    // Patterns tried in order by apply(); each has three groups: (before)(textToReflow)(after).
+    private Pattern[] patterns = {
+        Pattern.compile("(\\n<p>)(.*?)(\\n\\n|\\n<)", Pattern.MULTILINE + Pattern.DOTALL),
+        Pattern.compile("(<li>)(.*?)(</li>)", Pattern.MULTILINE + Pattern.DOTALL),
+        Pattern.compile("(<td>)([^<]*)(</td>)", Pattern.MULTILINE + Pattern.DOTALL)
+    };
+
+    /**
+     * Default transformer reflows paragraphs, li's and td's.
+     * Keeps the three built-in patterns declared in the {@code patterns} field.
+     */
+    public ReflowTransformer() {
+    }
+
+    /**
+     * Create transformer with specific regexp.
+     *
+     * Regexp must provide three groups: (before)(textToReflow)(after).
+     *
+     * @param regexp the only pattern this instance applies; compiled with MULTILINE and DOTALL
+     */
+    public ReflowTransformer(String regexp) {
+        patterns = new Pattern[]{
+                    Pattern.compile(regexp, Pattern.MULTILINE + Pattern.DOTALL)
+                };
+    }
+    // Rewrites buffer in place: in every match, each " \n" inside group 2 becomes a single " ".
+    public void apply(StringBuffer buffer) {
+        for (Pattern pattern : patterns) {
+
+            System.out.println("  Reflowing: " +  pattern);
+            Matcher matcher = pattern.matcher(buffer);
+            // NOTE(review): group(1)/group(3) are read after buffer.replace(); offsets may be stale if the buffer shrank - confirm.
+            int start = 0;
+            while (matcher.find(start)) {
+                String temp = matcher.group(2);
+                temp = temp.replaceAll(" \\n", " "); // only newlines preceded by a space are joined
+                buffer.replace(matcher.start(), matcher.end(), matcher.group(1) + temp + matcher.group(3));
+                start = matcher.start() + matcher.group(1).length() + temp.length() + matcher.group(3).length() - 1; // index of the last char of the rewritten match
+            }
+        }
+
+    }
+    // Fixed description; does not vary with the configured patterns.
+    public String toString() {
+        return "Reflowing block tags";
+    }
+}