Java HTML Table parser

Posted on September 28, 2006
Tags: java

I put together this class for scraping HTML tables. It supports nested tables. At some stage I’ll put this up in SVN along with its test cases.

import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Stack;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.parser.ParserDelegator;
import org.apache.log4j.Logger;

public class TableParser extends ArrayList {

    static Logger logger = Logger.getLogger(TableParser.class);

    Stack s = new Stack();
    /**     * Process this reader.
     * @param f
     * @throws IOException
     */
    public void parse(Reader f) throws IOException {
        this.clear();
        new ParserDelegator().parse(f, parser, false);
    }

    private HTMLEditorKit.ParserCallback parser = new HTMLEditorKit.ParserCallback() {

        private boolean inTD;
    
        private String tdBuffer;
    
        public void handleError(String arg0, int arg1) {
    //            System.out.println("Error "+arg0);
             // TODO Auto-generated method stub
            super.handleError(arg0, arg1);
        }
    
        public void handleText(char[] arg0, int arg1) {
            if (inTD){
                tdBuffer += new String(arg0);
            }
        }
    
        public void handleStartTag(Tag tag, MutableAttributeSet arg1, int arg2) {
             if (tag == HTML.Tag.TABLE){
                 s.add(new TableParser.HTMLTable());
             } else if (tag == HTML.Tag.TR){
                 s.add(new TableParser.HTMLRow());
             } else if (tag == HTML.Tag.TD){
                 inTD = true;
                 tdBuffer = "";
             }
        }
    
        public void handleEndTag(Tag tag, int arg1) {
             if (tag == HTML.Tag.TABLE){
                 TableParser.HTMLTable T = (TableParser.HTMLTable)s.pop();
                 if (s.size() == 0){
                     TableParser.this.add(T);
                 } else if (s.peek() instanceof HTMLRow){
                     ((HTMLRow)s.peek()).add(T);
                 } else {
                     logger.error("Need to be within nothing or a cell/row");
                 }
             } else if (tag == HTML.Tag.TR){
                 HTMLRow r = (HTMLRow)s.pop();
                 ((TableParser.HTMLTable)s.peek()).add(r);
             } else if (tag == HTML.Tag.TD){
                 if (inTD){
                     ((TableParser.HTMLRow)s.peek()).add(tdBuffer);
                     inTD = false;
                 }
             }
         }
    
    };

    public class HTMLTable extends ArrayList{}

    public class HTMLRow extends ArrayList{}

}

powered by performancing firefox