`
guoxinzz
  • 浏览: 430978 次
  • 性别: Icon_minigender_1
  • 来自: 深圳
社区版块
存档分类
最新评论

c#抓取网页分析

 
阅读更多

目的:
抓取网页,分析网页内容,进行处理获取信息。
例子:
抓km169上的adsl用户的费用信息,分析存储到本地数据库。
步骤:1、抓取。2、分析。3、存储。


2006-2-13 05:48

1抓取
/// <summary>
/// Posts <paramref name="postData"/> to <paramref name="url"/> as a URL-encoded form and
/// returns the response body decoded with the system default encoding.
/// </summary>
/// <param name="url">Address of the page to request.</param>
/// <param name="postData">Form body to send; <c>null</c> is treated as an empty body.</param>
/// <param name="err">Empty on success, otherwise the message of the exception that was caught.</param>
/// <returns>The response text, or an empty string when the request failed.</returns>
public string GetPage(string url, string postData, out string err)
{
    err = string.Empty;
    Encoding encoding = Encoding.Default;
    try
    {
        byte[] data = encoding.GetBytes(postData ?? string.Empty);

        // Set up the request.
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.CookieContainer = new CookieContainer();
        request.AllowAutoRedirect = true;
        request.Method = "POST";
        request.ContentType = "application/x-www-form-urlencoded";
        request.ContentLength = data.Length;

        // Write the form body; the using block closes the stream even if Write throws.
        using (Stream outstream = request.GetRequestStream())
        {
            outstream.Write(data, 0, data.Length);
        }

        // Fetch the reply and read it completely. Response, stream and reader are all
        // disposed on every path so the underlying connection is released.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream instream = response.GetResponseStream())
        using (StreamReader sr = new StreamReader(instream, encoding))
        {
            return sr.ReadToEnd();
        }
    }
    catch (Exception ex)
    {
        // Failures are reported through the out parameter instead of being thrown.
        err = ex.Message;
        return string.Empty;
    }
}
[[i] Last edited by 王 on 2006-2-13 at 13:49 [/i]]


2006-2-13 05:56

2、分析
/// <summary>
/// Fetches the page at <c>KMADSLURL</c> with <c>GetPage</c> and saves one <c>Rec</c> for
/// every <c>table_det("...", ... , "...");</c> call found in the returned text.
/// </summary>
/// <returns>Always <c>null</c>; the records are persisted through <c>Rec.Save()</c>.</returns>
public string Get()
{
    // One argument of table_det: optional newlines/whitespace, then a double-quoted value.
    // The value is captured with [^"]* - "everything up to the closing quote".
    const string field = "\\n*\\s*\"([^\"]*)\"";
    // table_det(...) is matched with eleven comma-separated quoted arguments.
    const int fieldCount = 11;

    string pattern = "table_det\\(" + field;
    for (int i = 1; i < fieldCount; i++)
    {
        pattern += "," + field;
    }
    pattern += "\\);";

    string str = GetPage(KMADSLURL, strReq, out err);
    Regex rgx = new Regex(pattern, RegexOptions.Singleline);
    foreach (Match m in rgx.Matches(str))
    {
        Rec r = new Rec();
        // NOTE(review): only the first captured field is stored here; groups 2-11 are
        // matched but unused. Rec's other members are not visible in this snippet.
        r.str1 = m.Groups[1].Value;
        r.Save();
    }
    return null;
}
此处的关键在于正则表达式:利用匹配关系获得一条条记录,再用捕获分组(m.Groups[1]~m.Groups[11])得到每个字段的内容,最后生成相应的记录即可(拼 SQL 也可)。这里用了个持久化的组件,下次详细说。
正则技巧:用 [^间隔符号](排除间隔符的字符类,例如 [^"]*)来划分字段,即"一直匹配到下一个间隔符为止"。:)不大好解释,大家自己体会下吧。
[[i] Last edited by 王 on 2006-2-13 at 13:58 [/i]]

Timothy
2006-2-14 01:44

我以前写了个多线程批量下载歌曲的程序,当时程序考虑的是挂接百度,同时又预留了扩展性,比如通过配置也可以获取雅虎的歌曲,这就要考虑到各个网站网页的编码方式。和楼主一样,我也是用了 HttpWebResponse 类。通过对各种编码的网页在二进制下面的观察,发现开头的几个字节(BOM)不同,所以转换成 string 的时候需要特殊处理,否则中文会有乱码。
以下是我对几种常用的编码进行的分析
// Choose the decoder from the byte-order mark (BOM) at the start of the buffer b.
// Each branch checks the length before indexing and skips the BOM bytes, so the
// decoded string does not start with a stray U+FEFF character.
// UTF-16 little endian: FF FE
if (b.Length >= 2 && b[0] == 0xFF && b[1] == 0xFE)
{
    return System.Text.Encoding.Unicode.GetString(b, 2, b.Length - 2);
}
// UTF-16 big endian: FE FF
else if (b.Length >= 2 && b[0] == 0xFE && b[1] == 0xFF)
{
    return System.Text.Encoding.BigEndianUnicode.GetString(b, 2, b.Length - 2);
}
// UTF-8: EF BB BF (all three bytes must match)
else if (b.Length >= 3 && b[0] == 0xEF && b[1] == 0xBB && b[2] == 0xBF)
{
    return System.Text.Encoding.UTF8.GetString(b, 3, b.Length - 3);
}
// No recognised BOM: fall back to the system default (ANSI) code page.
else
{
    return System.Text.Encoding.Default.GetString(b, 0, b.Length);
}

Timothy
2006-2-14 01:49

其中b是网页源代码的以二进制方式读取的数组


2006-2-14 03:39

小强~呵呵,

Timothy
2006-2-14 05:38

这样的也算小强阿,我一向都把你当我的偶像来的:)


2006-2-16 09:01

你是我偶像,把你那个程序详细分析下给大家看看嘛。

Timothy
2006-2-16 10:14

我开始是想把它完善,然后和我的网站绑定,然后通过在程序开始或者结束弹出广告
来赚钱,当我完成了一个模型的时候,就是能够从百度对它的几个分类(TOP500什么的)进行批量下载的时候,
我同事说已经有人作了,我下载了看了下挺好的,不过还不人性化,有些需求没有考虑到,
比如只能挂接BAIDU,没有灵活配置的接口,使可以挂接其他网站,比如古典音乐
网站什么的.但是它很小,才300k,而我用 .NET 作的东西太大了,加上 Framework 都快接近25M了,
别人说什么也不会用,所以就暂停了,等我有时间多用VC重新弄个人性化的来玩玩.
现在要弄微软路线的BI了,看看数据挖掘和报表服务,耽误了工作以后就麻烦了.
百度也过分,知道有人老是通过它来下载歌曲,源代码的结构经常改,有两个办法,一个是
你也经常去分析他的源代码,发现变了,你赶快修改你的配置文件,其中配置文件放在
你的网站上,下载歌曲的程序运行时通过后台线程去读取它.这样就是累.另外一个办法就是
通过人工智能分析,比如读取TOP500页面后,上面有几百首歌,让程序自动去分析那些是
歌曲的显示名称,那些是连接的URL,哪些是序号什么的,还有歌词URL,这样就对百度不变应万变了,
呵呵,需要一定技术
今天玩台球去了,明天贴点源代码,主要是太多了,除非画个UML图.

Timothy
2006-2-17 01:32

我开始也写了和你一样的单线程下载的类,不过后来通过增加一个委托,使它能够实现同步和异步的调用。后来我在网上下载了一个兄弟的源代码,他的是异步多线程调用,不过代码有 BUG,不能多线程下载,但是架构很优美。我花了一个晚上时间把它修改了一下;为了错误重试时我个人方便,我增加了个别方法,破坏了他的封装性。有兴趣看他的代码的朋友可以在下面看:
namespace Mp3Crazy
{
using System;
/// <summary>
/// Event data for an exception: the exception itself, the download state supplied with
/// it, and the action the subscriber selects in response.
/// </summary>
public class ExceptionEventArgs : System.EventArgs
{
    // The exception handed to the constructor.
    private System.Exception _error;
    // Action chosen by the subscriber; left at the enum's default until it is set.
    private ExceptionActions _action;
    // Download state handed to the constructor.
    private DownLoadState _state;

    internal ExceptionEventArgs(System.Exception e, DownLoadState DownloadState)
    {
        _error = e;
        _state = DownloadState;
    }

    /// <summary>Gets the download state supplied when the event data was created.</summary>
    public DownLoadState DownloadState
    {
        get { return this._state; }
    }

    /// <summary>Gets the exception supplied when the event data was created.</summary>
    public Exception Exception
    {
        get { return this._error; }
    }

    /// <summary>Gets or sets the action to take in response to the exception.</summary>
    public ExceptionActions ExceptionAction
    {
        get { return this._action; }
        set { this._action = value; }
    }
}
/// <summary>
/// Event data for a download event; carries the download state supplied by the raiser.
/// </summary>
public class DownLoadEventArgs : System.EventArgs
{
    // State handed to the constructor.
    private DownLoadState _state;

    public DownLoadEventArgs(DownLoadState DownloadState)
    {
        _state = DownloadState;
    }

    /// <summary>Gets the download state supplied when the event data was created.</summary>
    public DownLoadState DownloadState
    {
        get { return this._state; }
    }
}
/// <summary>
/// Event data for the "thread finished" notification; carries the identifier that was
/// passed by the code raising the event.
/// </summary>
public class ThreadProcessEventArgs : System.EventArgs
{
    private string _id;

    /// <param name="id">Identifier supplied by the raiser.</param>
    public ThreadProcessEventArgs(string id)
    {
        this._id = id;
    }

    /// <summary>
    /// Gets the identifier passed to the constructor. Without this property the stored
    /// value could never be read by an event handler.
    /// </summary>
    public string Id
    {
        get { return this._id; }
    }
}
}

Timothy
2006-2-17 01:33

namespace Mp3Crazy
{
using System;
/// <summary>
/// Describes one piece of a download: the request and response URLs, the local file,
/// the attachment name, and the byte position and length the piece covers. It can also
/// carry the callback that downloads the piece, so it can serve as a thread entry point.
/// </summary>
public class DownLoadState
{
    private string _FileName;
    private string _AttachmentName;
    private int _Position;
    private string _RequestURL;
    private string _ResponseURL;
    private int _Length;
    private byte[] _Data;
    // Callback run by StartDownloadFileChunk; only set by the constructor that takes one.
    private ThreadCallbackHandler _ThreadCallback;
    // Client notified after the callback has run; set through the httpWebClient property.
    private HttpWebClient _hwc;

    /// <summary>Gets the local file name passed to the constructor.</summary>
    public string FileName
    {
        get { return _FileName; }
    }

    /// <summary>Gets the byte position passed to the constructor.</summary>
    public int Position
    {
        get { return _Position; }
    }

    /// <summary>Gets the length passed to the constructor.</summary>
    public int Length
    {
        get { return _Length; }
    }

    /// <summary>Gets the attachment name passed to the constructor.</summary>
    public string AttachmentName
    {
        get { return _AttachmentName; }
    }

    /// <summary>Gets the URL that was requested.</summary>
    public string RequestURL
    {
        get { return _RequestURL; }
    }

    /// <summary>Gets the response URL passed to the constructor.</summary>
    public string ResponseURL
    {
        get { return _ResponseURL; }
    }

    /// <summary>Gets the data buffer, or <c>null</c> when the state was created without one.</summary>
    public byte[] Data
    {
        get { return _Data; }
    }

    /// <summary>Gets or sets the client that is notified when the chunk callback has finished.</summary>
    public HttpWebClient httpWebClient
    {
        get { return this._hwc; }
        set { this._hwc = value; }
    }

    /// <summary>Creates a state that carries a data buffer.</summary>
    internal DownLoadState(string RequestURL, string ResponseURL, string FileName, string AttachmentName, int Position, int Length, byte[] Data)
    {
        this._FileName = FileName;
        this._RequestURL = RequestURL;
        this._ResponseURL = ResponseURL;
        this._AttachmentName = AttachmentName;
        this._Position = Position;
        this._Data = Data;
        this._Length = Length;
    }

    /// <summary>Creates a state that carries the callback used to download its chunk.</summary>
    internal DownLoadState(string RequestURL, string ResponseURL, string FileName, string AttachmentName, int Position, int Length, ThreadCallbackHandler tch)
    {
        this._RequestURL = RequestURL;
        this._ResponseURL = ResponseURL;
        this._FileName = FileName;
        this._AttachmentName = AttachmentName;
        this._Position = Position;
        this._Length = Length;
        this._ThreadCallback = tch;
    }

    /// <summary>Creates a purely descriptive state (no data, no callback).</summary>
    internal DownLoadState(string RequestURL, string ResponseURL, string FileName, string AttachmentName, int Position, int Length)
    {
        this._RequestURL = RequestURL;
        this._ResponseURL = ResponseURL;
        this._FileName = FileName;
        this._AttachmentName = AttachmentName;
        this._Position = Position;
        this._Length = Length;
    }

    /// <summary>
    /// Runs the chunk callback with this state's request URL, file name, position and
    /// length, then tells the owning client that the work has ended. Does nothing when
    /// no callback was supplied.
    /// </summary>
    internal void StartDownloadFileChunk()
    {
        if (this._ThreadCallback != null)
        {
            this._ThreadCallback(this._RequestURL, this._FileName, this._Position, this._Length);
            // httpWebClient is optional; without this check a state whose client was
            // never assigned would throw NullReferenceException on the worker thread.
            if (this._hwc != null)
            {
                this._hwc.OnThreadProcess("");
            }
        }
    }
}
}

Timothy
2006-2-17 01:33

/* .Net/C#: 实现支持断点续传多线程下载的工具类
* Reflector 了一下 System.Net.WebClient ,改写或增加了若干:
* DownLoad、Upload 相关方法!
* 增加了 DataReceive、ExceptionOccurrs 事件
*/
namespace Mp3Crazy
{
using System;
using System.IO;
using System.Net;
using System.Text;
using System.Security;
using System.Threading;
using System.Collections.Specialized;
// Delegate whose signature matches the method a worker thread executes for one chunk.
// In this file it is invoked as (request URL, local file name, start position, length).
public delegate void ThreadCallbackHandler(string S, string s, int I, int i);
// Action to take after an exception has been reported to a subscriber.
public enum ExceptionActions
{
Throw,
CancelAll,
Ignore,
Retry
}
/// <summary>
/// Download client that supports resumable, multi-threaded (chunked) downloads.
/// </summary>
public class HttpWebClient
{
// Handler signature for the ExceptionOccurrs event.
public delegate void ExceptionEventHandler(HttpWebClient Sender, ExceptionEventArgs e);
public event ExceptionEventHandler ExceptionOccurrs; // raised when an exception is caught
// Handler signature for the ThreadProcessEnd event.
public delegate void ThreadProcessEventHandler(HttpWebClient Sender, ThreadProcessEventArgs e);
public event ThreadProcessEventHandler ThreadProcessEnd; // raised when a piece of threaded work has finished
private int _FileLength,_getLength; // _FileLength: total size of the file; _getLength: running count of bytes received
// Assigned to HttpWebRequest.Timeout by DownloadFile.
public int TimeOut=20000;
// NOTE(review): SongID, UrlParsed, FileName, Free and RetryTimes are not referenced by the
// code visible here; presumably bookkeeping for the calling application - confirm.
public int SongID=0;
public bool UrlParsed;
public string FileName;
public bool Free=true;
public int RetryTimes;
// Block counters maintained by the caller (the demo compares curBlock with TBlocks
// in its ThreadProcessEnd handler); this class itself does not read them.
public int TBlocks=1,curBlock;
/// <summary>Gets the value of the <c>_FileLength</c> field (total size of the file).</summary>
public int FileLength
{
    get { return this._FileLength; }
}
/// <summary>Gets the value of the <c>_getLength</c> field (bytes received so far).</summary>
public int GetLength
{
    get { return this._getLength; }
}
[[i] Last edited by Timothy on 2006-2-17 at 09:48 [/i]]

Timothy
2006-2-17 01:34

/// <summary>
/// Downloads a file in chunks. The first chunk is read on the calling thread from the
/// initial response; every further chunk is fetched on its own thread through
/// <see cref="DownloadFileChunk"/>. Exceptions are reported through
/// <see cref="ExceptionOccurrs"/> rather than thrown.
/// </summary>
/// <param name="Address">URL to download.</param>
/// <param name="FileName">Path of the local file to write.</param>
/// <param name="ChunksCount">Requested number of chunks (one thread per chunk); values below 1 are treated as 1.</param>
public void DownloadFile(string Address, string FileName, int ChunksCount)
{
    // Chunks are never smaller than 128 KB.
    const int MinChunkSize = 2 * 64 * 1024;

    int p = 0; // position of the chunk currently being planned
    int s = 0; // chunk size
    _getLength = 0;
    string a = null; // attachment name
    HttpWebRequest hwrq;
    HttpWebResponse hwrp = null;
    try
    {
        hwrq = (HttpWebRequest) WebRequest.Create(this.GetUri(Address));
        hwrq.Timeout = TimeOut;
        // Credentials only take effect if they are assigned before the request is sent.
        hwrq.Credentials = this.m_credentials;
        hwrp = (HttpWebResponse) hwrq.GetResponse();

        // Clamp an unknown (-1) or oversized length to Int32.MaxValue (0x7FFFFFFF).
        long L = hwrp.ContentLength;
        L = ((L == -1) || (L > 0x7fffffff)) ? ((long) 0x7fffffff) : L;
        int l = (int) L;
        this._FileLength = l;

        a = hwrp.Headers["Content-Disposition"]; // attachment
        if (a != null)
        {
            a = a.Substring(a.LastIndexOf("filename=") + 9);
        }
        else
        {
            a = FileName;
        }

        // Guard against a zero/negative chunk count (would divide by zero).
        int chunks = ChunksCount < 1 ? 1 : ChunksCount;
        s = l / chunks;
        if (s < MinChunkSize)
        {
            s = MinChunkSize;
        }

        // Size of the chunk read on this thread. If the file is smaller than one
        // chunk the loop below never runs and the read simply stops at end of stream.
        int first = s;
        int i = 0;
        while (l >= s)
        {
            l -= s;
            if (l < s)
            {
                // Less than one chunk would remain: fold the tail into this chunk.
                s += l;
            }
            if (i++ > 0)
            {
                DownLoadState x = new DownLoadState(Address, hwrp.ResponseUri.AbsolutePath, FileName, a, p, s, new ThreadCallbackHandler(this.DownloadFileChunk));
                x.httpWebClient = this;
                // One thread per additional chunk.
                Thread t = new Thread(new ThreadStart(x.StartDownloadFileChunk));
                t.Start();
            }
            else
            {
                // Remember the (possibly tail-extended) size of the first chunk;
                // using the pre-fold size would silently drop the tail bytes.
                first = s;
            }
            p += s;
        }

        s = first;
        this.ResponseAsBytes(Address, hwrp, s, FileName);
        this.OnThreadProcess("");
    }
    catch (Exception e)
    {
        if (this.ExceptionOccurrs != null)
        {
            // hwrp is still null when the initial request itself failed.
            string path = "";
            if (hwrp != null)
            {
                path = hwrp.ResponseUri.AbsolutePath;
            }
            DownLoadState x = new DownLoadState(Address, path, FileName, a, p, s);
            ExceptionEventArgs eea = new ExceptionEventArgs(e, x);
            ExceptionOccurrs(this, eea);
        }
    }
}
/// <summary>
/// Raises <see cref="ThreadProcessEnd"/> with the given identifier, if anyone is subscribed.
/// </summary>
/// <param name="id">Identifier placed in the event data.</param>
internal void OnThreadProcess(string id)
{
    // Copy the delegate first: this method is called from several download threads, and
    // a handler being removed between the null check and the call would otherwise throw.
    ThreadProcessEventHandler handler = ThreadProcessEnd;
    if (handler != null)
    {
        handler(this, new ThreadProcessEventArgs(id));
    }
}
/// <summary>
/// Downloads one chunk of a file; callers can use it to build their own multi-threaded
/// or resumable download. Exceptions are reported through <see cref="ExceptionOccurrs"/>.
/// </summary>
/// <param name="Address">URL to download.</param>
/// <param name="FileName">Path of the local file to write.</param>
/// <param name="FromPosition">Byte offset at which the requested range starts.</param>
/// <param name="Length">Chunk size in bytes.</param>
public void DownloadFileChunk(string Address, string FileName, int FromPosition, int Length)
{
    HttpWebResponse hwrp = null;
    string a = null; // attachment name
    try
    {
        HttpWebRequest hwrq = (HttpWebRequest) WebRequest.Create(this.GetUri(Address));
        // NOTE(review): unlike DownloadFile, no credentials or timeout are applied to
        // chunk requests - confirm whether that is intentional.
        // Open-ended range; ResponseAsBytes stops after Length bytes.
        hwrq.AddRange(FromPosition);
        hwrp = (HttpWebResponse) hwrq.GetResponse();

        a = hwrp.Headers["Content-Disposition"]; // attachment
        if (a != null)
        {
            a = a.Substring(a.LastIndexOf("filename=") + 9);
        }
        else
        {
            a = FileName;
        }
        this.ResponseAsBytes(Address, hwrp, Length, FileName);
    }
    catch (Exception e)
    {
        if (this.ExceptionOccurrs != null)
        {
            // hwrp is null when creating or sending the request failed; dereferencing
            // it here would replace the real error with a NullReferenceException.
            string path = "";
            if (hwrp != null)
            {
                path = hwrp.ResponseUri.AbsolutePath;
            }
            DownLoadState x = new DownLoadState(Address, path, FileName, a, FromPosition, Length);
            ExceptionEventArgs eea = new ExceptionEventArgs(e, x);
            ExceptionOccurrs(this, eea);
        }
    }
}
/// <summary>
/// Copies up to <paramref name="Length"/> bytes from the response into
/// <paramref name="FileName"/>, starting at the file offset named by the response's
/// Content-Range header (offset 0 when the header is absent). Exceptions are reported
/// through <see cref="ExceptionOccurrs"/>.
/// </summary>
/// <param name="RequestURL">URL that was requested; only used for error reporting.</param>
/// <param name="Response">Response whose body is read.</param>
/// <param name="Length">Maximum number of bytes to copy.</param>
/// <param name="FileName">Path of the local file to write.</param>
internal void ResponseAsBytes(string RequestURL, WebResponse Response, long Length, string FileName)
{
    const int BufferSize = 30000;

    string a = null; // attachment name
    int P = 0;       // position within the whole file
    long copied = 0; // bytes copied for this chunk
    Stream S = null;
    System.IO.FileStream sw = null;
    try
    {
        a = Response.Headers["Content-Disposition"]; // attachment
        if (a != null)
        {
            a = a.Substring(a.LastIndexOf("filename=") + 9);
        }

        // "bytes 100-199/1000" -> 100: where this body belongs in the file.
        string s = Response.Headers["Content-Range"];
        if (s != null)
        {
            s = s.Replace("bytes ", "");
            s = s.Substring(0, s.IndexOf("-"));
            P = Convert.ToInt32(s);
        }

        byte[] buffer1 = new byte[BufferSize];
        S = Response.GetResponseStream();
        // Shared read/write access so several chunk threads can have the file open at once.
        sw = new System.IO.FileStream(FileName, System.IO.FileMode.OpenOrCreate, System.IO.FileAccess.ReadWrite, System.IO.FileShare.ReadWrite);

        while (copied < Length)
        {
            // Never read past the end of this chunk; otherwise the bytes of the next
            // chunk would be written (and counted) twice.
            int want = (int) Math.Min((long) BufferSize, Length - copied);
            int num2 = S.Read(buffer1, 0, want);
            if (num2 <= 0)
            {
                break; // end of stream
            }
            sw.Position = P;
            sw.Write(buffer1, 0, num2);
            copied += num2;
            P += num2;
            // Several chunk threads update the shared counter concurrently.
            Interlocked.Add(ref _getLength, num2);
        }
    }
    catch (Exception e)
    {
        if (this.ExceptionOccurrs != null)
        {
            string path = "";
            if (Response != null)
            {
                path = Response.ResponseUri.AbsolutePath;
            }
            // Report where the copy stopped and how many bytes of the chunk are still
            // missing, so a handler can resume from exactly that point.
            long remaining = Length - copied;
            if (remaining < 0)
            {
                remaining = 0;
            }
            if (remaining > int.MaxValue)
            {
                remaining = int.MaxValue;
            }
            DownLoadState x = new DownLoadState(RequestURL, path, FileName, a, P, (int) remaining);
            ExceptionEventArgs eea = new ExceptionEventArgs(e, x);
            ExceptionOccurrs(this, eea);
        }
    }
    finally
    {
        // Always release the file and the connection, including after an exception.
        if (sw != null)
        {
            sw.Close();
        }
        if (S != null)
        {
            S.Close();
        }
    }
}
/// <summary>
/// Reads the whole response body into a byte array. When the content length is unknown
/// (-1) the buffer starts at 64 KB, grows in 64 KB steps, and is trimmed to the number
/// of bytes actually read.
/// </summary>
/// <param name="response">Response whose body is read.</param>
/// <returns>The body bytes.</returns>
private byte[] ResponseAsBytes(WebResponse response)
{
    int num2;
    long num1 = response.ContentLength;
    bool flag1 = false; // true when the length is unknown and the buffer must grow
    if (num1 == -1)
    {
        flag1 = true;
        num1 = 0x10000;
    }
    byte[] buffer1 = new byte[(int) num1];
    int num3 = 0; // bytes read so far
    Stream stream1 = response.GetResponseStream();
    try
    {
        do
        {
            num2 = stream1.Read(buffer1, num3, ((int) num1) - num3);
            num3 += num2;
            if (flag1 && (num3 == num1))
            {
                // Buffer is full: enlarge it by another 64 KB and keep what was read.
                num1 += 0x10000;
                byte[] buffer2 = new byte[(int) num1];
                Buffer.BlockCopy(buffer1, 0, buffer2, 0, num3);
                buffer1 = buffer2;
            }
        }
        while (num2 != 0);
    }
    finally
    {
        // Close the stream even when a read throws.
        stream1.Close();
    }
    if (flag1)
    {
        // Trim the over-allocated buffer to the exact size.
        byte[] buffer3 = new byte[num3];
        Buffer.BlockCopy(buffer1, 0, buffer3, 0, num3);
        buffer1 = buffer3;
    }
    return buffer1;
}
// Query-string parameters appended by GetUri; created lazily by the QueryString getter.
private NameValueCollection m_requestParameters;
// Optional base address that GetUri combines with the path it is given.
private Uri m_baseAddress;
// Credentials exposed through the Credentials property; starts as the default credentials.
private ICredentials m_credentials = CredentialCache.DefaultCredentials;
/// <summary>Gets or sets the credentials stored in <c>m_credentials</c>.</summary>
public ICredentials Credentials
{
    get { return m_credentials; }
    set { m_credentials = value; }
}
/// <summary>
/// Gets or sets the query-string parameters. The getter never returns <c>null</c>: an
/// empty collection is created and stored on first access.
/// </summary>
public NameValueCollection QueryString
{
    get
    {
        NameValueCollection current = m_requestParameters;
        if (current == null)
        {
            current = new NameValueCollection();
            m_requestParameters = current;
        }
        return current;
    }
    set { m_requestParameters = value; }
}
/// <summary>
/// Gets or sets the base address as a string. The getter returns an empty string when
/// no base address is set; setting <c>null</c> or an empty string clears it.
/// </summary>
/// <exception cref="ArgumentException">The value cannot be turned into a <see cref="Uri"/>.</exception>
public string BaseAddress
{
    get
    {
        return m_baseAddress == null ? string.Empty : m_baseAddress.ToString();
    }
    set
    {
        if (string.IsNullOrEmpty(value))
        {
            m_baseAddress = null;
            return;
        }

        try
        {
            m_baseAddress = new Uri(value);
        }
        catch (Exception cause)
        {
            throw new ArgumentException("value", cause);
        }
    }
}
/// <summary>
/// Builds the request URI for <paramref name="path"/>: resolves it against the base
/// address when one is set and, when query parameters have been assigned, replaces the
/// query string with <c>key=value</c> pairs joined by <c>&amp;</c>.
/// </summary>
/// <param name="path">Absolute URI, path relative to the base address, or local file path.</param>
/// <returns>The resulting URI; a path that is not a valid URI is treated as a local file path.</returns>
private Uri GetUri(string path)
{
    Uri uri1;
    try
    {
        if (this.m_baseAddress != null)
        {
            uri1 = new Uri(this.m_baseAddress, path);
        }
        else
        {
            uri1 = new Uri(path);
        }
        if (this.m_requestParameters == null)
        {
            return uri1;
        }

        StringBuilder builder1 = new StringBuilder();
        string text1 = string.Empty; // separator: empty before the first pair, "&" afterwards
        for (int num1 = 0; num1 < this.m_requestParameters.Count; num1++)
        {
            // NOTE(review): keys and values are appended as-is, without URL escaping.
            builder1.Append(text1 + this.m_requestParameters.AllKeys[num1] + "=" + this.m_requestParameters[num1]);
            text1 = "&";
        }
        UriBuilder builder2 = new UriBuilder(uri1);
        builder2.Query = builder1.ToString();
        uri1 = builder2.Uri;
    }
    catch (UriFormatException)
    {
        // Not a URI: interpret the argument as a file-system path.
        uri1 = new Uri(Path.GetFullPath(path));
    }
    return uri1;
}
}
}

Timothy
2006-2-17 01:45

这个是如何使用的一个simple demo
namespace MultiThread
{
using System;
using System.IO;
using System.Net;
using System.Text;
using System.Security;
using System.Threading;
using System.Collections;
using System.Collections.Specialized;
using Mp3Crazy;
/// <summary>
/// Demo / test class showing how to use HttpWebClient.
/// </summary>
class AppTest
{
// _k is only mentioned in commented-out code in this class; _K is set by go() and
// passed to DownloadFile as the chunk count.
int _k = 0;
int _K = 0;
/// <summary>
/// Entry point of the demo: starts the downloads, then waits for Enter before leaving Main.
/// </summary>
static void Main()
{
    go();
    // Block until the user presses Enter. The dead commented-out experiments, the unused
    // result variable and the forced GC.Collect() of the original were removed.
    System.Console.ReadLine();
}
// Starts the demo downloads: each loop pass creates a fresh AppTest and HttpWebClient,
// subscribes the AppTest's handlers and downloads http://localhost/gsx.MP3 to E:/ under
// a file name that ends with the loop index.
// NOTE(review): the for-header below was truncated when this listing was published (the
// text from '<' to the end of the line is missing), so the loop bound is unknown and the
// method does not compile as shown - restore it from the original source.
private static void go()
{
int i=0;
for(i=0;i {
AppTest a = new AppTest();
HttpWebClient x = new HttpWebClient();
x.TBlocks=2;
x.curBlock=0;
a._K =2;
x.TimeOut=10000;
// Subscribe to the DataReceive event (disabled)
//x.DataReceive += new Microshaoft.Utils.HttpWebClient.DataReceiveEventHandler(a.x_DataReceive);
// Subscribe to the ExceptionOccurrs and ThreadProcessEnd events
x.ExceptionOccurrs += new HttpWebClient.ExceptionEventHandler(a.x_ExceptionOccurrs);
x.ThreadProcessEnd += new HttpWebClient.ThreadProcessEventHandler(a.x_ThreadProcessEnd);
string F = "http://localhost/gsx.MP3";
a._F = F;
// Local name = last URL segment followed by the loop index.
string f = F.Substring(F.LastIndexOf("/") + 1)+i.ToString();
a._f =f;
//(new System.Threading.Thread(new System.Threading.ThreadStart(new ThreadProcessState(F, @"E:/temp/" + f, 10, x).StartThreadProcess))).Start();
x.DownloadFile(F, @"E:/" + f, a._K);
}
}
// Set by go(): _F is the download URL, _f the local file name (without the E:/ prefix).
string _F;
string _f;
/// <summary>
/// ExceptionOccurrs handler: prints the error, then downloads the failed range again
/// (a simple form of resuming) and marks the exception as ignored.
/// </summary>
private void x_ExceptionOccurrs(HttpWebClient Sender, ExceptionEventArgs e)
{
    System.Console.WriteLine(e.Exception.Message);

    // Retry with the URL and file recorded in the failure state. The original used _f,
    // which lacks the "E:/" prefix the download was started with, so the retried bytes
    // ended up in a different file.
    DownLoadState failed = e.DownloadState;
    // NOTE(review): the retry client has no handlers attached, so a second failure is
    // dropped silently; choose a different strategy if that matters.
    HttpWebClient x = new HttpWebClient();
    x.DownloadFileChunk(failed.RequestURL, failed.FileName, failed.Position, failed.Length);
    e.ExceptionAction = ExceptionActions.Ignore;
}
/// <summary>
/// ThreadProcessEnd handler: counts finished blocks on the sender and prints "end" once
/// the count reaches <c>TBlocks</c>.
/// </summary>
private void x_ThreadProcessEnd(HttpWebClient Sender, ThreadProcessEventArgs e)
{
    // The event is raised from several download threads, so the counter is incremented
    // atomically; "new value == TBlocks" matches the original "old value == TBlocks - 1".
    if (Interlocked.Increment(ref Sender.curBlock) == Sender.TBlocks)
    {
        // "\n" restored: the published listing had turned the backslash into a slash.
        System.Console.WriteLine("end\n");
    }
}
}
/// <summary>
/// Small sample that requests a byte range of a page and prints the request headers
/// and the returned content.
/// </summary>
class Test
{
    /// <summary>
    /// Requests bytes 50-100 of http://localhost/default.aspx, prints the request
    /// headers and the body to the console, then waits for Enter.
    /// </summary>
    public static void Main2()
    {
        // Create the request and ask for a byte range.
        HttpWebRequest myHttpWebRequest = (HttpWebRequest)WebRequest.Create("http://localhost/default.aspx");
        myHttpWebRequest.AddRange(50, 100);

        // The using blocks release the response and its stream even if reading fails.
        using (HttpWebResponse myHttpWebResponse = (HttpWebResponse)myHttpWebRequest.GetResponse())
        {
            // Escape sequences restored: the published listing had "/n" and "/t".
            Console.WriteLine("\nThe HttpHeaders are \n\n\tName\t\tValue\n{0}", myHttpWebRequest.Headers);
            foreach (string key in myHttpWebRequest.Headers)
            {
                Console.WriteLine("key:{0}", key);
            }

            // Print the contents of the page to the console, 256 characters at a time.
            Console.WriteLine("\nThe HTML contents of page the are : \n\n ");
            using (Stream streamResponse = myHttpWebResponse.GetResponseStream())
            using (StreamReader streamRead = new StreamReader(streamResponse))
            {
                char[] readBuff = new char[256];
                int count;
                while ((count = streamRead.Read(readBuff, 0, readBuff.Length)) > 0)
                {
                    Console.Write(new string(readBuff, 0, count));
                }
            }
        }
        Console.ReadLine();
    }
}
}

分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics