抓取和分析網頁的類。
主要功能有:
1、提取網(wǎng)頁(yè)的純文本,去所有html標簽和javascript代碼
2、提取網頁的鏈接,包括href和frame及iframe
3、提取網頁的title等(其它的標簽可依此類推,正則是一樣的)
4、可以實現簡單的表單提交及cookie保存
- <STRONG class=c>
-
-
- </STRONG>
-
- <STRONG class=k>using</STRONG> System;
- <STRONG class=k>using</STRONG> System.Data;
- <STRONG class=k>using</STRONG> System.Configuration;
- <STRONG class=k>using</STRONG> System.Net;
- <STRONG class=k>using</STRONG> System.IO;
- <STRONG class=k>using</STRONG> System.Text;
- <STRONG class=k>using</STRONG> System.Collections.Generic;
- <STRONG class=k>using</STRONG> System.Text.RegularExpressions;
- <STRONG class=k>using</STRONG> System.Threading;
- <STRONG class=k>using</STRONG> System.Web;
/// <summary>
/// Downloads a web page over HTTP and exposes its title, its text with markup
/// removed, and the links it contains (anchors, frames and iframes).
/// Cookies are kept per host in a process-wide table so that later requests to
/// the same host reuse them; an optional form POST can be issued first to log in.
/// </summary>
/// <remarks>
/// Construction never throws for an unreachable or malformed address: the
/// failure is written to the console and <see cref="IsGood"/> becomes false.
/// </remarks>
public class WebPage
{
    #region Constants and shared state

    /// <summary>Largest response body, in bytes, that will be downloaded (4 MiB).</summary>
    private const int MaxPageBytes = 1 << 22;

    /// <summary>User-Agent header sent with every request.</summary>
    private const string UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";

    /// <summary>Options shared by every HTML-scanning expression.</summary>
    private const RegexOptions HtmlOptions = RegexOptions.Multiline | RegexOptions.IgnoreCase;

    /// <summary>URL suffixes that are never downloaded.</summary>
    private static readonly string[] SkippedExtensions = new string[] { ".rar", ".dat", ".msi" };

    // The expressions are compiled once instead of on every call.
    private static readonly Regex AnchorRegex = new Regex("(?m)<a[^><]+href=(\"|')?(?<url>([^>\"'\\s)])+)(\"|')?[^>]*>(?<text>(\\w|\\W)*?)</", HtmlOptions);
    private static readonly Regex FrameRegex = new Regex("<[i]*frame[^><]+src=(\"|')?(?<url>([^>\"'\\s)])+)(\"|')?[^>]*>", HtmlOptions);
    // Removed from anchor text: nested tags, whitespace, &nbsp; entities, stray '&' and quotes.
    private static readonly Regex LinkTextNoiseRegex = new Regex("(<[^>]+>)|(\\s)|(&nbsp;)|&|\"", HtmlOptions);
    private static readonly Regex ScriptRegex = new Regex(@"(?m)<script[^>]*>(\w|\W)*?</script[^>]*>", HtmlOptions);
    private static readonly Regex StyleRegex = new Regex(@"(?m)<style[^>]*>(\w|\W)*?</style[^>]*>", HtmlOptions);
    private static readonly Regex SelectRegex = new Regex(@"(?m)<select[^>]*>(\w|\W)*?</select[^>]*>", HtmlOptions);
    private static readonly Regex AnchorBlockRegex = new Regex(@"(?m)<a[^>]*>(\w|\W)*?</a[^>]*>", HtmlOptions);
    private static readonly Regex TagOrNbspRegex = new Regex("(<[^>]+?>)|&nbsp;", HtmlOptions);
    private static readonly Regex WhitespaceRegex = new Regex("(\\s)+", HtmlOptions);
    private static readonly Regex TitleRegex = new Regex(@"(?m)<title[^>]*>(?<title>(?:\w|\W)*?)</title[^>]*>", HtmlOptions);
    private static readonly Regex MetaCharsetRegex = new Regex("charset\\s*=\\s*[\"']?(?<cding>[\\w.:-]+)", RegexOptions.IgnoreCase);
    private static readonly Regex NonLatin1Regex = new Regex(@"[^\x00-\xff]+");

    /// <summary>Cookie containers keyed by host name; every access is made under a lock on the table.</summary>
    private static readonly Dictionary<string, CookieContainer> webcookies = new Dictionary<string, CookieContainer>();

    #endregion

    #region Private fields

    private Uri m_uri;               // final address of the page (after redirects); null if the URL was invalid
    private List<Link> m_links;      // links found in the page, filled on first use
    private bool m_linksParsed;      // true once m_html has been scanned for links
    private string m_title;          // cached <title> text, "" until found
    private string m_html;           // raw markup of the page
    private string m_outstr;         // cached text with markup removed (link text kept)
    private string m_outstrNoLink;   // cached text with markup and anchors removed; null until computed
    private bool m_good;             // false when the page could not be downloaded
    private int m_pagesize;          // length of m_html in characters
    private string m_post;           // form data used for the login request, if any
    private string m_loginurl;       // address the login form was posted to, if any

    #endregion

    #region Private methods

    /// <summary>
    /// Scans the markup for anchors, frames and iframes the first time it is
    /// called and returns the accumulated list.
    /// </summary>
    /// <returns>The page's link list (the same instance on every call).</returns>
    private List<Link> getLinks()
    {
        // A dedicated flag (rather than "Count == 0") keeps pages without any
        // link from being re-scanned on every call.
        if (!m_linksParsed)
        {
            m_linksParsed = true;
            if (m_uri != null)
            {
                collectLinks(AnchorRegex, true);
                collectLinks(FrameRegex, false);
            }
        }
        return m_links;
    }

    /// <summary>
    /// Adds one <see cref="Link"/> per match of <paramref name="pattern"/>,
    /// resolving the captured address against the page address.
    /// Addresses that cannot be resolved are skipped.
    /// </summary>
    private void collectLinks(Regex pattern, bool hasText)
    {
        for (Match match = pattern.Match(m_html); match.Success; match = match.NextMatch())
        {
            Uri target;
            if (!Uri.TryCreate(m_uri, match.Groups["url"].Value, out target))
            {
                continue;
            }
            string text = hasText ? LinkTextNoiseRegex.Replace(match.Groups["text"].Value, "") : "";
            m_links.Add(new Link(target.AbsoluteUri, text));
        }
    }

    /// <summary>
    /// Removes script, style and select blocks, optionally whole anchors, then
    /// every remaining tag and &amp;nbsp; entity, and finally collapses runs of
    /// whitespace into a single space.
    /// </summary>
    private static string stripMarkup(string html, bool withLink)
    {
        string text = ScriptRegex.Replace(html, "");
        text = StyleRegex.Replace(text, "");
        text = SelectRegex.Replace(text, "");
        if (!withLink)
        {
            text = AnchorBlockRegex.Replace(text, "");
        }
        text = TagOrNbspRegex.Replace(text, "");
        return WhitespaceRegex.Replace(text, " ");
    }

    /// <summary>
    /// Returns at most <paramref name="firstN"/> characters of the text obtained
    /// by removing the markup from <paramref name="instr"/>.
    /// </summary>
    /// <param name="instr">Markup to convert.</param>
    /// <param name="firstN">Maximum number of characters to return.</param>
    /// <param name="withLink">True to keep the text of anchors, false to drop anchors entirely.</param>
    private string getFirstNchar(string instr, int firstN, bool withLink)
    {
        // Each flavour has its own cache: sharing one made the second flavour
        // requested return the text computed for the first.
        string text;
        if (withLink)
        {
            if (m_outstr == "")
            {
                m_outstr = stripMarkup(instr, true);
            }
            text = m_outstr;
        }
        else
        {
            if (m_outstrNoLink == null)
            {
                m_outstrNoLink = stripMarkup(instr, false);
            }
            text = m_outstrNoLink;
        }
        return text.Length > firstN ? text.Substring(0, firstN) : text;
    }

    /// <summary>
    /// Packs the four bytes of an IPv4 address into an unsigned integer, most
    /// significant byte first, so that addresses can be compared numerically.
    /// </summary>
    private static uint getuintFromIP(IPAddress x)
    {
        byte[] bt = x.GetAddressBytes();
        // Shifting uint operands cannot overflow, unlike int multiplication in a checked build.
        return ((uint)bt[0] << 24) | ((uint)bt[1] << 16) | ((uint)bt[2] << 8) | (uint)bt[3];
    }

    /// <summary>
    /// Returns the IPv4 address of the host of <paramref name="url"/>, or null
    /// when the URL is not absolute or the host has no resolvable IPv4 address.
    /// Results are remembered in <paramref name="cache"/> per host name.
    /// </summary>
    private static IPAddress resolveIPv4(string url, Dictionary<string, IPAddress> cache)
    {
        Uri parsed;
        if (!Uri.TryCreate(url, UriKind.Absolute, out parsed))
        {
            return null;
        }

        string host = parsed.Host;
        IPAddress ip;
        if (cache.TryGetValue(host, out ip))
        {
            return ip;
        }

        ip = null;
        IPAddress literal;
        if (IPAddress.TryParse(host, out literal))
        {
            // The host is already an address: no DNS query is needed.
            if (literal.AddressFamily == System.Net.Sockets.AddressFamily.InterNetwork)
            {
                ip = literal;
            }
        }
        else
        {
            try
            {
                foreach (IPAddress candidate in Dns.GetHostEntry(host).AddressList)
                {
                    if (candidate.AddressFamily == System.Net.Sockets.AddressFamily.InterNetwork)
                    {
                        ip = candidate;
                        break;
                    }
                }
            }
            catch (System.Net.Sockets.SocketException)
            {
                // Host does not resolve: treated as "no address".
            }
            catch (ArgumentException)
            {
                // Host name rejected by the resolver: treated as "no address".
            }
        }

        cache[host] = ip;
        return ip;
    }

    /// <summary>
    /// Returns up to <paramref name="count"/> links whose address (or text, when
    /// <paramref name="matchText"/> is true) matches <paramref name="pattern"/>.
    /// </summary>
    private List<Link> filterLinks(string pattern, int count, bool matchText)
    {
        Regex filter = new Regex(pattern, HtmlOptions);   // built once, not once per link
        List<Link> matches = new List<Link>();
        foreach (Link link in getLinks())
        {
            if (matches.Count >= count)
            {
                break;
            }
            string subject = matchText ? link.text : link.url;
            if (subject != null && filter.IsMatch(subject))
            {
                matches.Add(link);
            }
        }
        return matches;
    }

    /// <summary>True when the string is null, empty or only whitespace.</summary>
    private static bool isBlank(string value)
    {
        return value == null || value.Trim().Length == 0;
    }

    /// <summary>True when the address ends with one of <see cref="SkippedExtensions"/> (case-insensitive).</summary>
    private static bool isSkippedExtension(string url)
    {
        foreach (string extension in SkippedExtensions)
        {
            if (url.EndsWith(extension, StringComparison.OrdinalIgnoreCase))
            {
                return true;
            }
        }
        return false;
    }

    /// <summary>Returns the named encoding, or null when the name is empty or unknown.</summary>
    private static Encoding encodingFromName(string name)
    {
        if (name == null || name.Length == 0)
        {
            return null;
        }
        try
        {
            return Encoding.GetEncoding(name);
        }
        catch (ArgumentException)
        {
            return null;
        }
        catch (NotSupportedException)
        {
            return null;
        }
    }

    /// <summary>
    /// Extracts the value of the charset parameter of a Content-Type header,
    /// without surrounding quotes or any following parameters.
    /// Returns null when the header has no charset parameter.
    /// </summary>
    private static string charsetFromContentType(string contentType)
    {
        const string key = "charset=";
        int start = contentType.IndexOf(key, StringComparison.OrdinalIgnoreCase);
        if (start == -1)
        {
            return null;
        }
        string value = contentType.Substring(start + key.Length);
        int end = value.IndexOf(';');
        if (end != -1)
        {
            value = value.Substring(0, end);
        }
        return value.Trim().Trim('"', '\'');
    }

    /// <summary>Decodes the bytes with the given encoding, honouring a byte-order mark if present.</summary>
    private static string decodeWith(byte[] body, Encoding encoding)
    {
        using (StreamReader reader = new StreamReader(new MemoryStream(body), encoding))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Converts a response body to text. The charset of the Content-Type header
    /// is used when it names a known encoding; otherwise the body is decoded
    /// with the system default encoding and searched for a "charset=" declaration,
    /// which is then applied to the raw bytes.
    /// </summary>
    private static string decode(byte[] body, string contentType)
    {
        Encoding fromHeader = encodingFromName(charsetFromContentType(contentType));
        if (fromHeader != null)
        {
            return decodeWith(body, fromHeader);
        }

        string provisional = decodeWith(body, Encoding.Default);
        Match meta = MetaCharsetRegex.Match(provisional);
        if (meta.Success)
        {
            Encoding declared = encodingFromName(meta.Groups["cding"].Value);
            if (declared != null)
            {
                string decoded = decodeWith(body, declared);
                // Many replacement characters mean the declaration was wrong;
                // in that case the default decoding is kept.
                int broken = 0;
                foreach (char c in decoded)
                {
                    if (c == '\uFFFD')
                    {
                        broken++;
                    }
                }
                if (broken <= 100)
                {
                    return decoded;
                }
            }
        }
        return provisional;
    }

    /// <summary>
    /// Reads the whole stream, or returns null as soon as more than
    /// <paramref name="limit"/> bytes have been received.
    /// </summary>
    private static byte[] readCapped(Stream input, int limit)
    {
        using (MemoryStream buffer = new MemoryStream())
        {
            byte[] chunk = new byte[8192];
            int read;
            while ((read = input.Read(chunk, 0, chunk.Length)) > 0)
            {
                if (buffer.Length + read > limit)
                {
                    return null;
                }
                buffer.Write(chunk, 0, read);
            }
            return buffer.ToArray();
        }
    }

    /// <summary>
    /// Returns the cookie container to use for <paramref name="host"/>.
    /// When <paramref name="loginCookies"/> is supplied it replaces whatever was
    /// stored for the host; otherwise the stored container is reused, or a new
    /// one is created and stored.
    /// </summary>
    private static CookieContainer cookiesFor(string host, CookieContainer loginCookies)
    {
        lock (webcookies)
        {
            if (loginCookies != null)
            {
                webcookies[host] = loginCookies;
                return loginCookies;
            }

            CookieContainer jar;
            if (!webcookies.TryGetValue(host, out jar))
            {
                jar = new CookieContainer();
                webcookies[host] = jar;
            }
            return jar;
        }
    }

    /// <summary>
    /// Unescapes the address and percent-encodes every run of characters above
    /// U+00FF using GB2312 (UTF-8 when GB2312 is not available on the platform).
    /// </summary>
    /// <exception cref="ArgumentNullException">The address is null.</exception>
    private static string normalizeUrl(string url)
    {
        if (url == null)
        {
            throw new ArgumentNullException("_url");
        }

        try
        {
            url = Uri.UnescapeDataString(url);
        }
        catch (UriFormatException)
        {
            // Best effort: an address that cannot be unescaped is used as supplied.
        }

        Encoding urlEncoding = encodingFromName("GB2312") ?? Encoding.UTF8;
        // Every run is encoded, not only the first one found.
        return NonLatin1Regex.Replace(url, delegate(Match run)
        {
            return System.Web.HttpUtility.UrlEncode(run.Value, urlEncoding);
        });
    }

    /// <summary>Puts every per-page field back to its "nothing downloaded yet" value.</summary>
    private void resetState()
    {
        m_uri = null;
        m_links = new List<Link>();
        m_linksParsed = false;
        m_html = "";
        m_outstr = "";
        m_outstrNoLink = null;
        m_title = "";
        m_pagesize = 0;
        m_good = true;
    }

    /// <summary>
    /// Downloads <paramref name="url"/> into this instance. Any failure is
    /// written to the console and recorded by setting <see cref="IsGood"/> to false.
    /// </summary>
    /// <param name="url">Absolute address of the page.</param>
    /// <param name="loginCookies">
    /// Cookies obtained from a login request, or null to use the container
    /// already associated with the host.
    /// </param>
    private void fetch(string url, CookieContainer loginCookies)
    {
        // State is initialised before anything can fail so that every member
        // stays usable on a page that could not be loaded.
        resetState();
        try
        {
            m_uri = new Uri(url);
            if (isSkippedExtension(url))
            {
                m_good = false;
                return;
            }

            HttpWebRequest rqst = (HttpWebRequest)WebRequest.Create(m_uri);
            rqst.AllowAutoRedirect = true;
            rqst.MaximumAutomaticRedirections = 3;
            rqst.UserAgent = UserAgent;
            rqst.KeepAlive = true;
            rqst.Timeout = 30000;
            rqst.CookieContainer = cookiesFor(m_uri.Host, loginCookies);

            // "using" releases the connection on every path, including exceptions.
            using (HttpWebResponse rsps = (HttpWebResponse)rqst.GetResponse())
            {
                string contentType = rsps.ContentType ?? "";
                if (!contentType.StartsWith("text/", StringComparison.OrdinalIgnoreCase) || rsps.ContentLength > MaxPageBytes)
                {
                    m_good = false;
                    return;
                }

                byte[] body;
                using (Stream sm = rsps.GetResponseStream())
                {
                    // The cap is enforced while reading as well, because the
                    // Content-Length header may be absent.
                    body = readCapped(sm, MaxPageBytes);
                }
                if (body == null)
                {
                    m_good = false;
                    return;
                }

                m_html = decode(body, contentType);
                m_pagesize = m_html.Length;
                m_uri = rsps.ResponseUri;
            }
        }
        catch (Exception ex)
        {
            // Deliberately broad: a page that cannot be fetched for any reason is
            // reported and flagged instead of aborting the caller.
            Console.WriteLine(ex.Message + (m_uri != null ? m_uri.ToString() : url));
            m_good = false;
        }
    }

    /// <summary>
    /// Posts <paramref name="post"/> to <paramref name="loginUrl"/> as form data
    /// and returns the cookies of the response, or null when the request failed.
    /// </summary>
    private static CookieContainer login(string loginUrl, string post)
    {
        byte[] payload = Encoding.Default.GetBytes(post);
        CookieContainer jar = new CookieContainer();
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(loginUrl);
            request.Method = "POST";
            request.ContentType = "application/x-www-form-urlencoded";
            request.AllowAutoRedirect = false;
            request.UserAgent = UserAgent;
            request.Timeout = 60000;
            request.KeepAlive = true;
            request.ContentLength = payload.Length;
            request.CookieContainer = jar;

            using (Stream body = request.GetRequestStream())
            {
                body.Write(payload, 0, payload.Length);
            }
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                foreach (Cookie ck in response.Cookies)
                {
                    jar.Add(ck);
                }
            }
            return jar;
        }
        catch (Exception ex)
        {
            // Best effort: a failed login falls back to an anonymous download.
            // The posted data is not logged because it may contain credentials.
            Console.WriteLine(ex.Message + loginUrl);
            return null;
        }
    }

    #endregion

    #region Public methods

    /// <summary>Returns the first <paramref name="firstN"/> characters of the page text, link text included.</summary>
    /// <param name="firstN">Maximum number of characters to return.</param>
    public string getContext(int firstN)
    {
        return getFirstNchar(m_html, firstN, true);
    }

    /// <summary>Returns the first <paramref name="firstN"/> characters of the page text with all anchors removed.</summary>
    /// <param name="firstN">Maximum number of characters to return.</param>
    public string getContextWithOutLink(int firstN)
    {
        return getFirstNchar(m_html, firstN, false);
    }

    /// <summary>Returns up to <paramref name="count"/> links whose address matches a regular expression.</summary>
    /// <param name="pattern">Regular expression, matched case-insensitively.</param>
    /// <param name="count">Maximum number of links to return.</param>
    public List<Link> getSpecialLinksByUrl(string pattern, int count)
    {
        return filterLinks(pattern, count, false);
    }

    /// <summary>Returns up to <paramref name="count"/> links whose text matches a regular expression.</summary>
    /// <param name="pattern">Regular expression, matched case-insensitively.</param>
    /// <param name="count">Maximum number of links to return.</param>
    public List<Link> getSpecialLinksByText(string pattern, int count)
    {
        return filterLinks(pattern, count, true);
    }

    /// <summary>
    /// Returns the links whose host has an IPv4 address inside the inclusive
    /// range [<paramref name="_ip_start"/>, <paramref name="_ip_end"/>].
    /// Links whose host cannot be resolved to an IPv4 address are skipped.
    /// </summary>
    /// <param name="_ip_start">Lowest address of the range, in dotted notation.</param>
    /// <param name="_ip_end">Highest address of the range, in dotted notation.</param>
    /// <exception cref="FormatException">A bound is not a valid IP address.</exception>
    /// <exception cref="ArgumentException">A bound is not an IPv4 address.</exception>
    public List<Link> getSpecialLinksByIP(string _ip_start, string _ip_end)
    {
        IPAddress ip_start = IPAddress.Parse(_ip_start);
        IPAddress ip_end = IPAddress.Parse(_ip_end);
        if (ip_start.AddressFamily != System.Net.Sockets.AddressFamily.InterNetwork ||
            ip_end.AddressFamily != System.Net.Sockets.AddressFamily.InterNetwork)
        {
            throw new ArgumentException("Only IPv4 range bounds are supported.");
        }

        // The bounds are converted once, outside the loop.
        uint lower = getuintFromIP(ip_start);
        uint upper = getuintFromIP(ip_end);

        List<Link> SpecialLinks = new List<Link>();
        // Many links share a host, so each host is resolved only once per call.
        Dictionary<string, IPAddress> resolved = new Dictionary<string, IPAddress>(StringComparer.OrdinalIgnoreCase);
        foreach (Link link in getLinks())
        {
            IPAddress ip = resolveIPv4(link.url, resolved);
            if (ip == null)
            {
                continue;
            }
            uint value = getuintFromIP(ip);
            if (value >= lower && value <= upper)
            {
                SpecialLinks.Add(link);
            }
        }
        return SpecialLinks;
    }

    /// <summary>
    /// Searches the page text for <paramref name="pattern"/> and returns the
    /// value of its first capturing group, or an empty string when nothing matches.
    /// </summary>
    /// <param name="pattern">Regular expression, matched case-insensitively.</param>
    public string getSpecialWords(string pattern)
    {
        Regex regex = new Regex(pattern, HtmlOptions);
        Match mc = regex.Match(Context);
        return mc.Success ? mc.Groups[1].Value : string.Empty;
    }

    /// <summary>
    /// Builds a page from markup that has already been obtained, without any
    /// network access.
    /// </summary>
    /// <param name="_url">Absolute address the markup came from; relative links are resolved against it.</param>
    /// <param name="html">Markup of the page; null is treated as empty.</param>
    /// <exception cref="ArgumentNullException"><paramref name="_url"/> is null.</exception>
    /// <exception cref="UriFormatException"><paramref name="_url"/> is not an absolute address.</exception>
    public static WebPage FromHtml(string _url, string html)
    {
        if (_url == null)
        {
            throw new ArgumentNullException("_url");
        }
        WebPage page = new WebPage();
        page.resetState();
        page.m_uri = new Uri(_url);
        page.m_html = html ?? "";
        page.m_pagesize = page.m_html.Length;
        return page;
    }

    #endregion

    #region Constructors

    /// <summary>Used by <see cref="FromHtml"/> only.</summary>
    private WebPage()
    {
    }

    /// <summary>Downloads <paramref name="_url"/>; failures are reported through <see cref="IsGood"/>.</summary>
    private void Init(string _url)
    {
        fetch(_url, null);
    }

    /// <summary>Downloads the page at <paramref name="_url"/>.</summary>
    /// <param name="_url">Address of the page.</param>
    /// <exception cref="ArgumentNullException"><paramref name="_url"/> is null.</exception>
    public WebPage(string _url)
    {
        Init(normalizeUrl(_url));
    }

    /// <summary>
    /// Downloads the page at <paramref name="_url"/>, first posting
    /// <paramref name="_post"/> to <paramref name="_loginurl"/> when both are
    /// given and no cookies are stored yet for the page's host.
    /// If the login request fails the page is downloaded without it.
    /// </summary>
    /// <param name="_url">Address of the page.</param>
    /// <param name="_loginurl">Address the login form is posted to; blank to skip the login.</param>
    /// <param name="_post">URL-encoded form data for the login; blank to skip the login.</param>
    /// <exception cref="ArgumentNullException"><paramref name="_url"/> is null.</exception>
    public WebPage(string _url, string _loginurl, string _post)
    {
        _url = normalizeUrl(_url);

        Uri target;
        bool skipLogin = !Uri.TryCreate(_url, UriKind.Absolute, out target) || isBlank(_loginurl) || isBlank(_post);
        if (!skipLogin)
        {
            // The shared table is only read while holding its lock.
            lock (webcookies)
            {
                skipLogin = webcookies.ContainsKey(target.Host);
            }
        }
        if (skipLogin)
        {
            Init(_url);
            return;
        }

        m_post = _post;
        m_loginurl = _loginurl;
        // A null result (failed login) makes fetch behave exactly like Init.
        fetch(_url, login(_loginurl, _post));
    }

    #endregion

    #region Properties

    /// <summary>Absolute address of the page after redirects, or an empty string when the address was invalid.</summary>
    public string URL
    {
        get
        {
            return m_uri != null ? m_uri.AbsoluteUri : string.Empty;
        }
    }

    /// <summary>Trimmed content of the page's title element, or an empty string when there is none.</summary>
    public string Title
    {
        get
        {
            if (m_title == "")
            {
                Match mc = TitleRegex.Match(m_html);
                if (mc.Success)
                {
                    m_title = mc.Groups["title"].Value.Trim();
                }
            }
            return m_title;
        }
    }

    /// <summary>All links found in the page: anchors first, then frames and iframes.</summary>
    public List<Link> Links
    {
        get
        {
            return getLinks();
        }
    }

    /// <summary>Text of the page with markup removed (link text kept).</summary>
    public string Context
    {
        get
        {
            if (m_outstr == "")
            {
                getContext(Int16.MaxValue);
            }
            return m_outstr;
        }
    }

    /// <summary>Length of the downloaded markup, in characters.</summary>
    public int PageSize
    {
        get
        {
            return m_pagesize;
        }
    }

    /// <summary>Links that point to the same host as the page, over http or https.</summary>
    public List<Link> InsiteLinks
    {
        get
        {
            if (m_uri == null)
            {
                return new List<Link>();
            }
            // The host is escaped so that its dots are literal, and must be
            // followed by a delimiter so that "host.example" does not also
            // match "host.example.org".
            string pattern = "^https?://" + Regex.Escape(m_uri.Host) + "(?:[:/?#]|$)";
            return getSpecialLinksByUrl(pattern, Int16.MaxValue);
        }
    }

    /// <summary>False when the page was skipped or could not be downloaded.</summary>
    public bool IsGood
    {
        get
        {
            return m_good;
        }
    }

    /// <summary>Host name of the page, or an empty string when the address was invalid.</summary>
    public string Host
    {
        get
        {
            return m_uri != null ? m_uri.Host : string.Empty;
        }
    }

    /// <summary>Form data used for the login request; null when no login was attempted.</summary>
    public string PostStr
    {
        get
        {
            return m_post;
        }
    }

    /// <summary>Address the login form was posted to; null when no login was attempted.</summary>
    public string LoginURL
    {
        get
        {
            return m_loginurl;
        }
    }

    #endregion
}
-
/// <summary>
/// A link found in a page: the address it points to and the text attached to it.
/// </summary>
public class Link
{
    /// <summary>Address the link points to.</summary>
    public string url;

    /// <summary>Text attached to the link.</summary>
    public string text;

    /// <summary>Creates a link from an address and its text; both values are stored as given.</summary>
    /// <param name="_url">Address the link points to.</param>
    /// <param name="_text">Text attached to the link.</param>
    public Link(string _url, string _text)
    {
        this.url = _url;
        this.text = _text;
    }
}