Java HTML Table parser

I put together this class for scraping HTML tables. It supports nested tables. At some stage I’ll put this up in SVN along with its test cases.

import java.io.IOException;

import java.io.Reader;

import java.util.ArrayList;

import java.util.Stack;import javax.swing.text.MutableAttributeSet;

import javax.swing.text.html.HTML;

import javax.swing.text.html.HTMLEditorKit;

import javax.swing.text.html.HTML.Tag;

import javax.swing.text.html.parser.ParserDelegator;

import org.apache.log4j.Logger;

public class TableParser extends ArrayList {

 static Logger logger = Logger.getLogger(TableParser.class);

 Stack s = new Stack();

 /**	 * Process this reader.

  * @param f

  * @throws IOException

  */	public void parse(Reader f) throws IOException {

 	this.clear();

 	new ParserDelegator().parse(f, parser, false);

 }

private HTMLEditorKit.ParserCallback parser = new HTMLEditorKit.ParserCallback(){

private boolean inTD;

 	private String tdBuffer;

public void handleError(String arg0, int arg1) {

//			System.out.println("Error "+arg0);

 		// TODO Auto-generated method stub

 		super.handleError(arg0, arg1);

 	}

public void handleText(char[] arg0, int arg1) {

 		if (inTD){

 			tdBuffer += new String(arg0);

 		}

 	}

public void handleStartTag(Tag tag, MutableAttributeSet arg1, int arg2) {

 		if (tag == HTML.Tag.TABLE){

 			s.add(new TableParser.HTMLTable());

 		} else if (tag == HTML.Tag.TR){

 			s.add(new TableParser.HTMLRow());

 		} else if (tag == HTML.Tag.TD){

 			inTD = true;

 			tdBuffer = "";

 		}

 	}

public void handleEndTag(Tag tag, int arg1) {

 		if (tag == HTML.Tag.TABLE){

 			TableParser.HTMLTable T = (TableParser.HTMLTable)s.pop();

 			if (s.size() == 0){

 				TableParser.this.add(T);

 			} else if (s.peek() instanceof HTMLRow){

 				((HTMLRow)s.peek()).add(T);

 			} else {

 				logger.error("Need to be within nothing or a cell/row");

 			}

 		} else if (tag == HTML.Tag.TR){

 			HTMLRow r = (HTMLRow)s.pop();

 			((TableParser.HTMLTable)s.peek()).add(r);

 		} else if (tag == HTML.Tag.TD){

 			if (inTD){

 				((TableParser.HTMLRow)s.peek()).add(tdBuffer);

 				inTD = false;

 			}

 		}

 	}

};

public class HTMLTable extends ArrayList{}

 public class HTMLRow extends ArrayList{}

}

powered by performancing firefox

This entry was posted in java. Bookmark the permalink.

3 Responses to Java HTML Table parser

  1. Al Anelli says:

    It looks like you scrape the cells into arrays (by rows?), but I’m a bit of a java newbie and I can’t see where the arrays are filled or how to access the data in the stack(s). I need to parse each cell to get the inside text and I’d love to know where to perform that method. An example of this in use with a complete working program would be EXCELLENT.

    Al

  2. Aulis says:

    Completely worthless without usage demo.

  3. mebel harkov says:

    Cool! It looks like what I was looking for. I will download and try it now… Hopefully, it works.

Leave a Reply