使用HtmlParser抽取页面所有文本数据的方法总结

loveofgod

浏览: 742618 次
性别:
来自: 北京

最近访客更多访客>>

u012363178

piggysnoopy

落林是尘啊

dev灰色天空

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

Open source

Eclipse JSP HTML

下面几种方法中，test1 可以说是最有效的：它只输出非空的文本节点，避免了结果中出现大量空格。
package   test;


import   java.io.BufferedReader;
import   java.io.File;
import   java.io.FileInputStream;
import   java.io.InputStreamReader;

import   org.htmlparser.Node;
import   org.htmlparser.NodeFilter;
import   org.htmlparser.Parser;
import   org.htmlparser.filters.NodeClassFilter;
import   org.htmlparser.filters.OrFilter;
import   org.htmlparser.nodes.TextNode;
import   org.htmlparser.parserapplications.StringExtractor;
import   org.htmlparser.tags.LinkTag;
import   org.htmlparser.util.NodeList;
import   org.htmlparser.util.ParserException;
import   org.htmlparser.visitors.HtmlPage;
import   org.htmlparser.visitors.TextExtractingVisitor;


/**
 * Demonstrates several ways of extracting text from an HTML page with
 * the HTML Parser library.
 */
public class ParseHtmlTest
{
    /** Sample page that is parsed when no path is given on the command line. */
    private static final String DEFAULT_FILE =
            "D:\\Eclipse\\workspace\\search\\test001\\content_1349887.htm";

    /**
     * Runs every extraction demo in turn and prints a separator line after
     * each one.
     *
     * @param args optional; {@code args[0]} is the HTML file to parse. When
     *             absent, {@link #DEFAULT_FILE} is used.
     * @throws Exception if any of the demos fails
     */
    public static void main(String[] args) throws Exception
    {
        String aFile = (args != null && args.length > 0) ? args[0] : DEFAULT_FILE;

        String content = readTextFile(aFile, "GBK");
        StringExtractor se = new StringExtractor(aFile);
        System.out.println(se.extractStrings(false));

        test1(content);
        System.out.println("=====Test1==============================");

        test2(content);
        System.out.println("=====Test2==========================");

        test3(content);
        System.out.println("=====Test3===============================");

        test4(content);
        System.out.println("=====Test4===============================");

        test5(aFile);
        System.out.println("======Test5==============================");

        // Fetches a remote resource, so this call is comparatively slow.
        test5("http://www.medlink.com.cn");
        System.out.println("====================================");
    }

    /**
     * Parses a resource directly (rather than an in-memory string) and prints
     * the body collected by an {@code HtmlPage} visitor.
     *
     * @param resource a file path or a URL
     * @throws Exception if the resource cannot be opened or parsed
     */
    public static void test5(String resource) throws Exception
    {
        Parser myParser = new Parser(resource);

        // Set the encoding explicitly.
        myParser.setEncoding("GBK");

        HtmlPage visitor = new HtmlPage(myParser);
        myParser.visitAllNodesWith(visitor);

        String textInPage = visitor.getBody().toString();
        System.out.println(textInPage);
    }

    /**
     * Processes the content as a page: prints the title, a separator line and
     * then the body. Recommended for a standard HTML page.
     *
     * @param content the HTML text to parse
     * @throws Exception if parsing fails
     */
    public static void test4(String content) throws Exception
    {
        Parser myParser = Parser.createParser(content, "GBK");

        HtmlPage visitor = new HtmlPage(myParser);
        myParser.visitAllNodesWith(visitor);

        String textInPage = visitor.getTitle();

        System.out.println(textInPage);
        System.out.println("－－－－－－－－－－－－－－－－－－－－");
        System.out.println(visitor.getBody());
    }

    /**
     * Parses the HTML page using the visitor pattern.
     *
     * Minor advantage: symbols such as &lt; and &gt; are translated.
     * Drawback: the output contains a lot of whitespace, and links cannot be
     * extracted.
     *
     * @param content the HTML text to parse
     * @throws Exception if parsing fails
     */
    public static void test3(String content) throws Exception
    {
        Parser myParser = Parser.createParser(content, "GBK");

        TextExtractingVisitor visitor = new TextExtractingVisitor();
        myParser.visitAllNodesWith(visitor);

        String textInPage = visitor.getExtractedText();
        System.out.println(textInPage);
    }

    /**
     * Selects text nodes and link tags with a filter and prints the trimmed,
     * non-empty text of each text node.
     *
     * Link tags pass the filter but are not printed: only {@code TextNode}
     * instances produce a line below.
     *
     * @param content the HTML text to parse
     * @throws ParserException if parsing fails
     */
    public static void test2(String content) throws ParserException
    {
        Parser myParser = Parser.createParser(content, "GBK");

        NodeFilter textFilter = new NodeClassFilter(TextNode.class);
        NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);

        // Meta tags are not handled for now.

        OrFilter lastFilter = new OrFilter();
        lastFilter.setPredicates(new NodeFilter[] { textFilter, linkFilter });

        NodeList nodeList = myParser.parse(lastFilter);
        Node[] nodes = nodeList.toNodeArray();

        for (int i = 0; i < nodes.length; i++)
        {
            Node anode = nodes[i];

            String line = "";
            if (anode instanceof TextNode)
            {
                TextNode textnode = (TextNode) anode;
                line = textnode.toPlainTextString().trim();
            }
            // TODO: to print link URLs as well, handle LinkTag here
            // (LinkTag.getLink()) and filter out JSP tags such as <% ... %>.

            if (isTrimEmpty(line))
            {
                continue;
            }

            System.out.println(line);
        }
    }

    /**
     * Extracts the plain text nodes and prints each one that is non-empty
     * after trimming.
     *
     * @param content the HTML text to parse
     * @throws ParserException if parsing fails
     */
    public static void test1(String content) throws ParserException
    {
        Parser myParser = Parser.createParser(content, null);

        // An exception could be thrown here.
        Node[] nodes = myParser.extractAllNodesThatAre(TextNode.class);

        for (int i = 0; i < nodes.length; i++)
        {
            TextNode textnode = (TextNode) nodes[i];
            String line = textnode.toPlainTextString().trim();
            if (line.length() == 0)
            {
                continue;
            }
            System.out.println(line);
        }
    }

    /**
     * Reads a whole text file into a string, terminating every line with
     * {@code "\r\n"}.
     *
     * This is a best-effort read: it never throws. If the file cannot be
     * opened, the encoding is unsupported, or reading fails part-way, the
     * problem is reported on {@code System.err} and whatever was read so far
     * (possibly the empty string) is returned.
     *
     * @param sFileName path of the file to read
     * @param sEncode   name of the character encoding used to decode the file
     * @return the file content, or the part read before a failure
     */
    public static String readTextFile(String sFileName, String sEncode)
    {
        StringBuffer sbStr = new StringBuffer();
        // Held separately so it can be closed even when the reader that wraps
        // it fails to construct (for example, on an unsupported encoding).
        FileInputStream in = null;

        try
        {
            in = new FileInputStream(new File(sFileName));
            BufferedReader ins = new BufferedReader(new InputStreamReader(in, sEncode));

            String dataLine;
            while (null != (dataLine = ins.readLine()))
            {
                sbStr.append(dataLine);
                sbStr.append("\r\n");
            }
        }
        catch (Exception e)
        {
            // Keep the never-throws contract, but do not hide the failure.
            System.err.println("readTextFile: cannot read '" + sFileName
                    + "' with encoding '" + sEncode + "': " + e);
        }
        finally
        {
            if (in != null)
            {
                try
                {
                    in.close();
                }
                catch (Exception ignored)
                {
                    // Nothing useful can be done if closing fails.
                }
            }
        }

        return sbStr.toString();
    }

    /**
     * Tells whether a string is empty once leading and trailing whitespace
     * has been removed.
     *
     * @param astr the string to test; may be {@code null}
     * @return {@code true} if {@code astr} is {@code null}, empty, or consists
     *         only of characters removed by {@link String#trim()}
     */
    public static boolean isTrimEmpty(String astr)
    {
        return astr == null || isBlank(astr.trim());
    }

    /**
     * Tells whether a string is {@code null} or has length zero.
     *
     * Despite the name, a whitespace-only string is NOT considered blank by
     * this method; use {@link #isTrimEmpty(String)} for that.
     *
     * @param astr the source string; may be {@code null}
     * @return {@code true} if {@code astr} is {@code null} or empty
     */
    public static boolean isBlank(String astr)
    {
        return astr == null || astr.length() == 0;
    }

}

分享到：

jni.h所在位置 | HttpClient中发送中文参数get请求出现乱码 ...

2008-04-24 05:59
浏览 4452
评论(1)
查看更多

1 楼 dd2086 2009-03-10

希望继续研究

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论