HTML Parserラッパ
HTML Parser - Java 製の HTML 解析ライブラリ「HTML Parser」の Scala 用ラッパを作り始めた。
・http://journal.mycom.co.jp/news/2005/06/15/029.html
方針は以下のような感じ(implicit conversion で HTML Parser の各クラスにメソッドを追加する)。
import org.htmlparser._
import org.htmlparser.util._
import org.htmlparser.filters._

//----------------------------------------------------
/**
 * Scala-side conveniences for the Java HTML Parser library (`org.htmlparser`),
 * provided as implicit conversions onto thin wrapper classes.
 *
 * Bring everything into scope with `import HttpParser._`.
 */
object HttpParser {

  /** Factory for CSS-selector based node filters. */
  object CSSSelector {
    /** Creates a `CssSelectorNodeFilter` from the given selector string. */
    def apply(selector: String): CssSelectorNodeFilter =
      new CssSelectorNodeFilter(selector)
  }

  /** Wrapper that adds the `|` combinator to a `NodeFilter`. */
  class MyNodeFilter(nodeFilter: NodeFilter) {
    /** Combines the wrapped filter and `q` into an `OrFilter`. */
    def |(q: NodeFilter): OrFilter = new OrFilter(nodeFilter, q)
  }
  implicit def NodeSelectorToExtNodeSelector(p: NodeFilter): MyNodeFilter =
    new MyNodeFilter(p)

  /** Wrapper that adds callback-style iteration to a `Parser`. */
  class MyParser(parser: Parser) {
    /**
     * Parses with `selector` and calls `f` once per resulting node, in the
     * order the node list yields them.
     *
     * Nothing is caught here: whatever `parser.parse` or `f` throws propagates
     * to the caller.
     *
     * @param selector filter handed to `parser.parse`
     * @param f        callback invoked for each node
     */
    def each(selector: NodeFilter)(f: Node => Unit): Unit = {
      val nodes = parser.parse(selector).elements()
      while (nodes.hasMoreNodes()) {
        f(nodes.nextNode())
      }
    }
  }
  implicit def ParserToExtParser(p: Parser): MyParser = new MyParser(p)

  /** Wrapper that adds a shortcut for a tag's `class` attribute. */
  class MyTag(tag: Tag) {
    /**
     * Value of the `class` attribute, exactly as `tag.getAttribute` returns it.
     * NOTE(review): presumably `null` when the attribute is absent - confirm
     * against the HTML Parser documentation.
     */
    def klass: String = tag.getAttribute("class")
  }
  implicit def TagToExtTag(t: Tag): MyTag = new MyTag(t)

  /** Wrapper that adds whitespace stripping to `String`. */
  class MyString(str: String) {
    /**
     * Removes leading and trailing whitespace (regex `\s`) from the string.
     *
     * NOTE(review): on JDK 11+ `java.lang.String` has its own `strip()`, so
     * `"x".strip` resolves to that method and this conversion is not applied.
     */
    def strip: String = str.replaceAll("(^\\s*|\\s*$)", "")
  }
  implicit def StringToExtString(s: String): MyString = new MyString(s)
}

//----------------------------------------------------
/**
 * Fetches one Lingr archive page and prints its chat log: handle names,
 * timestamps and message bodies.
 *
 * An explicit `main` replaces the legacy `Application` trait, which ran the
 * whole program inside the object's initializer.
 */
object lingr_log_getter {
  import HttpParser._

  /** Archive page to download and parse. */
  val url: String = "http://www.lingr.com/room/scala-ja/archives/2008/02/24"

  val handleSelector: CssSelectorNodeFilter = CSSSelector("#messages .handleText")
  val msgSelector: CssSelectorNodeFilter = CSSSelector("#messages .messageTextContainer")
  val timestampSelector: CssSelectorNodeFilter = CSSSelector("#messages .timestamp")

  /** Accepts any node matched by one of the three selectors above. */
  val selector: OrFilter = (handleSelector | msgSelector | timestampSelector)

  /**
   * Parses `url` and prints one entry per matched node.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val parser = new Parser(url)
    parser.each(selector) { node =>
      // Stripped text of the first child, or "" when the node has no children.
      val text = Option(node.getFirstChild).map(_.getText.strip).getOrElse("")
      node match {
        case tag: Tag if tag.klass == "handleText" =>
          println("[" + text + "]")
        case tag: Tag if tag.klass == "timestamp" =>
          if (text != "") println("\n<" + text + ">")
        case tag: Tag if tag.klass == "messageTextContainer" =>
          // getChildren may be null for a tag without children, so guard it
          // before asking for its size.
          Option(tag.getChildren).filter(_.size() > 1) match {
            case Some(children) =>
              println("-----")
              println(children.asString().strip)
              println("-----")
            case None =>
              println(" " + text)
          }
        case _ =>
          // Ignore nodes whose class attribute is not exactly one of the three
          // names above (e.g. several classes) instead of dying with MatchError.
          ()
      }
    }
  }
}