-
我用以下函数来得到网页的html
可是得不到全部html啊,我把byte[] MyBytes = new byte[1024];
改成byte[] MyBytes = new byte[10240000];还是得不到全部html
而且byte[] MyBytes = new byte[204800];比上面的得到的html还多
这是怎么回事啊,html字节到底是怎么算的?public string getHTML(string url)
{
try
{
WebRequest webRequest = WebRequest.Create(url);
WebResponse webResponse = webRequest.GetResponse();
//确定该文件是二进制文件还是文本文件,网页是文本文件
if (!webResponse.ContentType.ToLower().StartsWith("text/"))
{
return null;
}
Stream stream = webResponse.GetResponseStream();
byte[] MyBytes = new byte[1024];
int MyBytesRead = stream.Read(MyBytes, 0, MyBytes.Length);
string html = System.Text.Encoding.Default.GetString(MyBytes, 0, MyBytesRead);
//string html = System.Text.Encoding.Convert(Encoding.GetEncoding("byte"),Encoding.Default,MyBytes);
//判断编码,若为utf-8则重新读取
Regex reg_charset = new Regex(@"charset\b\s*=\s*(?<charset>[^""]*)");
if (reg_charset.IsMatch(html))
{
if (reg_charset.Match(html).Groups["charset"].Value == "utf-8")
{
//html = Encoding.Convert(Encoding.Default,Encoding.UTF8,MyBytes).ToString();
html = System.Text.Encoding.UTF8.GetString(MyBytes, 0, MyBytesRead);
}
}
return html;
}
catch (UriFormatException ex)
{Console.WriteLine(ex.Message);
return null;
}
catch (WebException ex)
{Console.WriteLine(ex.Message);
return null;
}
}