我正在创建一个 C# 4.0 应用程序来使用 Web 客户端下载网页内容。
WebClient 函数:
/// <summary>
/// Downloads the document at <paramref name="url"/> and returns its body as text.
/// </summary>
/// <param name="url">
/// Address to fetch. When it does not already start with "http://" or "https://"
/// (compared case-insensitively), "http://" is prepended.
/// </param>
/// <returns>
/// The response body decoded as UTF-8, or an empty string when the address is
/// null, empty or not a valid absolute URI, when the download fails, or when the
/// body itself is empty.
/// </returns>
public static string GetDocText(string url)
{
    if (string.IsNullOrEmpty(url))
    {
        return string.Empty;
    }

    // Only add a scheme when none is present; an unconditional "http://" prefix
    // would turn "https://host" into the invalid "http://https://host".
    bool hasHttpScheme =
        url.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
        url.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
    if (!hasHttpScheme)
    {
        url = "http://" + url;
    }

    // Reject anything that is not a well-formed absolute URI up front instead of
    // relying on DownloadString throwing for a null Uri.
    Uri innUri;
    if (!Uri.TryCreate(url, UriKind.Absolute, out innUri))
    {
        return string.Empty;
    }

    try
    {
        // The using block disposes the client; no explicit Dispose call is needed.
        using (ConfigurableWebClient client = new ConfigurableWebClient())
        {
            /* Set timeout for webclient */
            client.Timeout = 600000;

            client.Headers.Add("User-Agent", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR " + "3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; InfoPath.2; AskTbFXTV5/5.15.4.23821; BRI/2)");
            // NOTE(review): "Vary" is normally a response header; it is kept here only
            // so the outgoing request stays identical to what callers sent before.
            client.Headers.Add("Vary", "Accept-Encoding");
            client.Encoding = Encoding.UTF8;

            string html = client.DownloadString(innUri);
            return string.IsNullOrEmpty(html) ? string.Empty : html;
        }
    }
    catch (Exception)
    {
        // Best-effort contract: any failure while downloading yields an empty string.
        return string.Empty;
    }
}
/// <summary>
/// A <see cref="WebClient"/> whose HTTP requests can be given a timeout and a
/// connection limit before they are sent.
/// </summary>
public class ConfigurableWebClient : WebClient
{
    /// <summary>
    /// Value copied to <see cref="HttpWebRequest.Timeout"/> on each HTTP request;
    /// when null the request's own timeout is left untouched.
    /// </summary>
    public int? Timeout { get; set; }

    /// <summary>
    /// Value copied to the request's <see cref="ServicePoint.ConnectionLimit"/>;
    /// when null the service point's limit is left untouched.
    /// </summary>
    public int? ConnectionLimit { get; set; }

    /// <summary>
    /// Creates the request through the base class and, when it is an
    /// <see cref="HttpWebRequest"/>, applies the configured settings to it.
    /// </summary>
    /// <param name="address">The resource the request targets.</param>
    /// <returns>The request produced by the base class; non-HTTP requests are returned unchanged.</returns>
    protected override WebRequest GetWebRequest(Uri address)
    {
        WebRequest created = base.GetWebRequest(address);
        HttpWebRequest http = created as HttpWebRequest;

        if (http != null)
        {
            if (Timeout.HasValue)
            {
                http.Timeout = Timeout.Value;
            }

            if (ConnectionLimit.HasValue)
            {
                http.ServicePoint.ConnectionLimit = ConnectionLimit.Value;
            }
        }

        // "created" and "http" are the same object whenever the cast succeeded.
        return created;
    }
}
我检查了 C# WebClient 下载到的内容,它与浏览器中显示的内容略有不同。
我在浏览器(Mozilla Firefox)和我的 WebClient 函数中使用了相同的 URL。
浏览器能正确显示网页内容,但我的 WebClient 的 DownloadString 返回的是另一份
HTML。请参阅下面我的 WebClient 收到的响应。
WebClient 下载到的 HTML:
<!DOCTYPE html>
<head>
<META NAME="ROBOTS" CONTENT="NOINDEX, NOFOLLOW">
<meta http-equiv="cache-control" content="max-age=0" />
<meta http-equiv="cache-control" content="no-cache" />
<meta http-equiv="expires" content="0" />
<meta http-equiv="expires" content="Tue, 01 Jan 1980 1:00:00 GMT" />
<meta http-equiv="pragma" content="no-cache" />
<meta http-equiv="refresh" content="10; url=/distil_r_captcha.html?Ref=/pgol/4-abbigliamento/3-Roma%20%28RM%29/p-7&distil_RID=A8D2F8B6-B314-11E3-A5E9-E04C5DBA1712" />
<script type="text/javascript" src="/ga.280243267228712.js?PID=6D4E4D1D-7094-375D-A439-0568A6A70836" defer></script><style type="text/css">#d__fFH{position:absolute;top:-5000px;left:-5000px}#d__fF{font-family:serif;font-size:200px;visibility:hidden}#glance7ca96c1b,#hiredf795fe70,#target01a7c05a,#hiredf795fe70{display:none!important}</style></head>
<body>
<div id="distil_ident_block"> </div>
<div id="d__fFH"><OBJECT id="d_dlg" CLASSID="clsid:3050f819-98b5-11cf-bb82-00aa00bdce0b" width="0px" height="0px"></OBJECT><span id="d__fF"></span></div></body>
</html>
我的问题是:我的 WebClient 函数没有返回实际的网页内容。