`
jeafyezheng
  • 浏览: 99596 次
  • 性别: Icon_minigender_1
  • 来自: 北京
最近访客 更多访客>>
文章分类
社区版块
存档分类
最新评论

如何用Java读取一个离线的网页而不产生乱码

阅读更多
package tools;

import java.io.*;
import java.io.BufferedInputStream;
import java.nio.charset.Charset;
import java.util.Map;
/**
 * <p>Title: ReadHtml</p>
 * <p>Description: Reads HTML files that were saved to disk and decodes them with the
 * charset the page itself declares, so the text does not come out garbled.</p>
 * <p>Copyright: Copyright (c) 2006</p>
 *
 * <p>Originally written to read pages of the split Tianwang web corpus. The first
 * {@code MAX_SNIFF_LINES} lines of a file are searched for a {@code charset=...}
 * declaration. When no declaration is found, or the declared charset is not supported by
 * the running JVM, GBK is assumed; pages in another encoding that carry no declaration may
 * therefore still be decoded incorrectly.</p>
 *
 * @author yezh
 * @version 1.0
 */
public class ReadHtml {
  /** Charset assumed when a page declares none, or declares one this JVM cannot decode. */
  private static final String DEFAULT_CHARSET = "gbk";

  /** Number of leading lines that are searched for a charset declaration. */
  private static final int MAX_SNIFF_LINES = 40;

  /** Attribute/parameter name that introduces the declaration (matched in lower case). */
  private static final String CHARSET_KEY = "charset";

  /**
   * Reads the HTML file at the given path.
   *
   * @param sfile path of the file to read
   * @return the decoded text; see {@link #ReadAllKindOfCharset(File)}
   */
  public static String ReadAllKindOfCharset(String sfile) {
    return ReadAllKindOfCharset(new File(sfile));
  }

  /**
   * Reads an HTML file and decodes it with the charset declared inside the file.
   *
   * <p>Every line of the result is terminated by a single {@code '\n'}, whatever line
   * terminator the file used, and the last line is terminated too.</p>
   *
   * @param file the file to read
   * @return the decoded text, or an empty string when the file could not be read (the
   *         failure is reported on standard output rather than thrown)
   */
  public static String ReadAllKindOfCharset(File file) {
    StringBuilder text = new StringBuilder(4096);
    String charset = DEFAULT_CHARSET;
    try {
      // Load the bytes once and close the file immediately; both the charset search and
      // the real decoding then work on the in-memory copy, so no mark/reset is needed.
      byte[] data = readFully(file);
      charset = detectCharset(data);

      // The reader wraps an in-memory array and holds no system resource.
      BufferedReader lines =
          new BufferedReader(new InputStreamReader(new ByteArrayInputStream(data), charset));
      String line;
      while ((line = lines.readLine()) != null) {
        text.append(line).append('\n');
      }
    } catch (IOException ex) {
      // Best effort by design: report and hand back whatever was decoded so far.
      // The message is the Chinese text for "error reading file", written with Unicode
      // escapes so this source compiles the same under any source-file encoding.
      System.out.println("\u8bfb\u53d6\u6587\u4ef6\u51fa\u9519");
      System.out.println(charset);
      System.out.println(ex);
    }
    return text.toString();
  }

  /** Returns the whole content of {@code file}, closing the stream even on failure. */
  private static byte[] readFully(File file) throws IOException {
    if (file == null) {
      throw new FileNotFoundException("file is null");
    }
    InputStream in = new FileInputStream(file);
    try {
      ByteArrayOutputStream out = new ByteArrayOutputStream(4096);
      byte[] chunk = new byte[4096];
      int n;
      while ((n = in.read(chunk)) != -1) {
        out.write(chunk, 0, n);
      }
      return out.toByteArray();
    } finally {
      in.close();
    }
  }

  /**
   * Searches the first {@code MAX_SNIFF_LINES} lines of {@code data} for a charset
   * declaration naming a charset this JVM supports.
   *
   * @return the declared charset name in lower case, or {@code DEFAULT_CHARSET}
   */
  private static String detectCharset(byte[] data) {
    StringBuilder lineBuf = new StringBuilder(256);
    int pos = 0;
    for (int lineNo = 0; lineNo < MAX_SNIFF_LINES && pos < data.length; lineNo++) {
      lineBuf.setLength(0);
      while (pos < data.length) {
        int b = data[pos++] & 0xFF;
        if (b == '\n') {
          break;
        }
        if (b == '\r') {
          // Windows line ending: swallow the LF only when one really follows, so a lone
          // CR does not eat the first byte of the next line.
          if (pos < data.length && data[pos] == '\n') {
            pos++;
          }
          break;
        }
        // One byte per char is enough here: the declaration itself is plain ASCII.
        lineBuf.append((char) b);
      }
      // Locale.ROOT keeps the lower-casing stable (e.g. "ISO" under a Turkish locale).
      String found = findCharsetInLine(lineBuf.toString().toLowerCase(java.util.Locale.ROOT));
      if (found != null) {
        return found;
      }
    }
    return DEFAULT_CHARSET;
  }

  /** Returns the first supported charset declared on {@code line}, or null if none. */
  private static String findCharsetInLine(String line) {
    int from = 0;
    int at;
    while ((at = line.indexOf(CHARSET_KEY, from)) != -1) {
      from = at + CHARSET_KEY.length();
      String candidate = parseCharsetValue(line, from);
      if (candidate != null && isSupported(candidate)) {
        return candidate;
      }
    }
    return null;
  }

  /**
   * Parses the value of a {@code charset = value} declaration whose key ends just before
   * {@code pos}. Handles both {@code content="text/html; charset=utf-8"} and the HTML5
   * form {@code <meta charset="utf-8">}.
   *
   * @return the raw value, or null when no {@code =} follows the key
   */
  private static String parseCharsetValue(String line, int pos) {
    int i = skipSpaces(line, pos);
    if (i >= line.length() || line.charAt(i) != '=') {
      return null;
    }
    i = skipSpaces(line, i + 1);
    if (i < line.length() && (line.charAt(i) == '"' || line.charAt(i) == '\'')) {
      i++; // opening quote of the HTML5 form
    }
    int start = i;
    while (i < line.length() && isCharsetNameChar(line.charAt(i))) {
      i++;
    }
    return line.substring(start, i);
  }

  /** Returns the index of the first non-whitespace character at or after {@code pos}. */
  private static int skipSpaces(String s, int pos) {
    int i = pos;
    while (i < s.length() && Character.isWhitespace(s.charAt(i))) {
      i++;
    }
    return i;
  }

  /** Tells whether {@code ch} may appear in a (lower-cased) charset name. */
  private static boolean isCharsetNameChar(char ch) {
    return (ch >= 'a' && ch <= 'z')
        || (ch >= '0' && ch <= '9')
        || ch == '-' || ch == '_' || ch == '.' || ch == ':' || ch == '+';
  }

  /** Tells whether this JVM can decode {@code name}; aliases are honoured. */
  private static boolean isSupported(String name) {
    try {
      return Charset.isSupported(name);
    } catch (IllegalArgumentException ex) {
      // Syntactically illegal name (empty, or starting with punctuation).
      return false;
    }
  }

  /**
   * Reads every regular file below the given directory, descending into sub-directories.
   * The decoded text is discarded. Does nothing when the path is not a directory or the
   * directory cannot be listed.
   *
   * @param Directory path of the directory to walk
   */
  public static void processDirectory(String Directory) {
    File dir = new File(Directory);
    if (!dir.isDirectory()) {
      return;
    }
    File[] files = dir.listFiles();
    if (files == null) {
      // listFiles() returns null on an I/O error or when access is denied.
      return;
    }
    for (int i = 0; i < files.length; i++) {
      if (files[i].isFile()) {
        ReadAllKindOfCharset(files[i]);
      } else if (files[i].isDirectory()) {
        // getPath(), not getName(): the bare name would be resolved against the working
        // directory instead of the directory being walked.
        processDirectory(files[i].getPath());
      }
    }
  }

  /** Manual test: prints one sample page, the elapsed time and a charset-support probe. */
  public static void main(String args[]) {
    long start = System.currentTimeMillis();
    System.out.println(
        ReadHtml.ReadAllKindOfCharset("luan/arrow.com.cnpdadianyingdefault.asp.htm"));
    System.out.println("time = " + (System.currentTimeMillis() - start));
    if (Charset.isSupported("gb_2312-80")) {
      System.out.println("true");
      // U+00B7 MIDDLE DOT, escaped so the source stays encoding-independent.
      System.out.println("\u00b7");
    }
  }
}
分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics