首页新闻招聘找找看知识库
  • 回复:1 浏览:9250 2008-05-27 22:42 来自 blueyund

    我用以下函数来得到网页的html
    可是得不到全部html啊,我把byte[] MyBytes = new byte[1024];
    改成byte[] MyBytes = new byte[10240000];还是得不到全部html
    而且byte[] MyBytes = new byte[204800];比上面的得到的html还多
    这是怎么回事啊,html字节到底是怎么算的?

    public string getHTML(string url)
            {
                try
                {                
                    WebRequest webRequest = WebRequest.Create(url);
                    WebResponse webResponse = webRequest.GetResponse();
                    //确定该文件是二进制文件还是文本文件,网页是文本文件
                    if (!webResponse.ContentType.ToLower().StartsWith("text/"))
                    {
                        return null;
                    }
                    Stream stream = webResponse.GetResponseStream();
                    byte[] MyBytes = new byte[1024];
                    int MyBytesRead = stream.Read(MyBytes, 0, MyBytes.Length);               
                    string html = System.Text.Encoding.Default.GetString(MyBytes, 0, MyBytesRead);
                    //string html = System.Text.Encoding.Convert(Encoding.GetEncoding("byte"),Encoding.Default,MyBytes);  
                    //判断编码,若为utf-8则重新读取
                    Regex reg_charset = new Regex(@"charset\b\s*=\s*(?<charset>[^""]*)");
                    if (reg_charset.IsMatch(html))
                    {
                        if (reg_charset.Match(html).Groups["charset"].Value == "utf-8")
                        {
                            //html = Encoding.Convert(Encoding.Default,Encoding.UTF8,MyBytes).ToString();
                            html = System.Text.Encoding.UTF8.GetString(MyBytes, 0, MyBytesRead);
                        }
                    }
                    return html;
                }
                catch (UriFormatException ex)
                {

                    Console.WriteLine(ex.Message);
                    return null;
                }
                catch (WebException ex)
                {

                    Console.WriteLine(ex.Message);
                    return null;
                }
            }

登录后才能评论,请先登录注册