一区二区久久-一区二区三区www-一区二区三区久久-一区二区三区久久精品-麻豆国产一区二区在线观看-麻豆国产视频

asp.net(c#)做一個網頁數據采集工具

通過這個軟件,一兩天就完成了幾千條產品數據的錄入。可見很多工作不必一味依靠人工去做;作為一個程序員,就是要把那些經常從事重復性、繁瑣工作的人解放出來。下面只是寫了一些核心代碼,而且采集邏輯必須和對應的網站相掛鉤。作者:鄭少群

復制代碼 代碼如下:
// Extracts the links found on a product-list page and writes them into textBox5,
// one absolute URL per line (lines are separated by "\r\n").
//
// textBox1 holds the address of the list page, textBox2 the site's domain that is
// prepended to root-relative links. Both must be non-empty.
private void button1_Click(object sender, EventArgs e)
{
    string pageUrl = textBox1.Text.Trim();
    string domain = textBox2.Text.Trim();

    if (pageUrl == "" || domain == "")
    {
        MessageBox.Show("網址和域名不能為空!", "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);
        return;
    }

    try
    {
        // Fetch the page the user entered. The input was validated above, so it is
        // the address that has to be downloaded (not a fixed one).
        string html = inc.GetHtml(pageUrl);

        // Every href="..." / href='...' attribute; duplicates are removed by GetMatchesStr.
        ArrayList links = inc.GetMatchesStr(html, @"href\s*=\s*(?:[\'\""\s](?<1>[^\""\']*)[\'\""])");

        StringBuilder sb = new StringBuilder();
        foreach (object item in links)
        {
            // Each entry is the whole attribute text, e.g. href="/a/b.html".
            string link = item.ToString().Replace("\"", "").Replace("'", "");

            // Strip the leading attribute name. The extraction pattern tolerates
            // whitespace around '=', so the removal has to tolerate it as well.
            link = Regex.Replace(link, @"^\s*href\s*=\s*", "", RegexOptions.IgnoreCase).Trim();

            // Root-relative link: prefix the site's domain.
            if (link.StartsWith("/", StringComparison.Ordinal))
            {
                link = domain + link;
            }

            // Add a scheme only when there is none; https links must be left alone.
            bool hasScheme = link.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                || link.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
            if (!hasScheme)
            {
                link = "http://" + link;
            }

            sb.Append(link).Append("\r\n");
        }

        // Show the extracted addresses, one link per line.
        textBox5.Text = sb.ToString();

        MessageBox.Show("共提取" + links.Count.ToString() + "個鏈接", "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);
    }
    catch (Exception err)
    {
        MessageBox.Show("提取出錯!原因:" + err.Message, "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);
    }
}




// Downloads every product page listed in textBox5 (one URL per line), cuts the product
// fields out of the HTML with plain string searches, stores the rows in the Access table
// Tb_Product and downloads each product image into the local "Images" folder.
// The string markers used below are specific to the scraped site and must be adapted
// for any other site.
//
// NOTE(review): this runs as a BackgroundWorker DoWork handler, yet it reads textBox5 /
// textBox2 and writes toolStripStatusLabel1 directly. UI access from the worker thread
// is not safe in WinForms; the inputs should be passed in through e.Argument and the
// label updated from the ProgressChanged handler. Left as-is because those handlers
// are outside this block.
private void backgroundWorker1_DoWork(object sender, DoWorkEventArgs e)
{
    // Start from an empty product table.
    Database.ExecuteNonQuery("delete from Tb_Product");
    DataTable dt2 = new DataTable();

    BackgroundWorker worker = (BackgroundWorker)sender; // used for progress reporting / cancellation

    // Split on line breaks only, so that URLs containing a comma stay intact.
    string[] Urls = textBox5.Text.Trim().ToLower().Split(new string[] { "\r\n" }, StringSplitOptions.None);
    string domain = textBox2.Text.Trim();
    StringBuilder ErrorStr = new StringBuilder();
    string ImageDir = AppDomain.CurrentDomain.BaseDirectory + "Images\\";

    using (OleDbConnection conn = new OleDbConnection(Database.ConnectionStrings))
    using (OleDbDataAdapter da = new OleDbDataAdapter("select * from Tb_Product", conn))
    // The command builder is never referenced again, but it must exist: it generates
    // the INSERT command that da.Update relies on.
    using (OleDbCommandBuilder cb = new OleDbCommandBuilder(da))
    {
        // Fill only to obtain the table's columns; the rows themselves are discarded.
        da.Fill(dt2);
        dt2.Rows.Clear();

        // Process one URL per iteration.
        for (int i = 0; i < Urls.Length; i++)
        {
            // Stop fetching on cancellation, but still persist what was collected.
            if (worker.CancellationPending)
            {
                break;
            }

            // A blank line must not abort the whole run (and lose every collected row).
            if (Urls[i] == "")
            {
                continue;
            }

            try
            {
                string html = inc.GetHtml(Urls[i]); // HTML of the product page
                DataRow NewRow = dt2.NewRow();

                // Product name: the text of the <title> element.
                string ProductName = html.Substring(html.IndexOf("<title>") + 7);
                NewRow["ProductName"] = ProductName.Remove(ProductName.IndexOf("</title>")).Trim();

                // Model id: whatever follows "Model:" inside the product name.
                string nameText = NewRow["ProductName"].ToString();
                NewRow["ModelId"] = nameText.Substring(nameText.IndexOf("Model:") + 6).Trim();

                // Description: from 26 characters past the "Product Details" marker
                // up to and including the next </table>.
                string Introduce = html.Substring(html.IndexOf("Product Details") + 26);
                Introduce = Introduce.Remove(Introduce.IndexOf("</table>") + 8).Trim();
                NewRow["Introduce"] = Introduce;

                // Image address: the src attribute of the first <img> after "align=center>".
                string ProductImage = html.Substring(html.IndexOf("align=center><img") + 17);
                ProductImage = domain + ProductImage.Substring(ProductImage.IndexOf("src=\"") + 5);
                ProductImage = ProductImage.Remove(ProductImage.IndexOf("\""));
                string imagePath = ImageDir + ProductImage.Substring(ProductImage.LastIndexOf("/") + 1);
                try
                {
                    inc.DownFile(ProductImage, imagePath);
                }
                catch (Exception)
                {
                    // A failed image download is recorded but does not discard the product row.
                    ErrorStr.Append("下載圖片失敗,圖片地址:" + imagePath + "\r\n");
                }

                dt2.Rows.Add(NewRow);

                worker.ReportProgress((i + 1) * 100 / Urls.Length, i);
                toolStripStatusLabel1.Text = "處理進度:" + (i + 1).ToString() + "/" + Urls.Length.ToString(); // progress text
            }
            catch (Exception err)
            {
                // Any fetch or parse failure skips this URL and is reported at the end.
                ErrorStr.Append("采集錯誤:" + err.Message + ";網址:" + Urls[i] + "\r\n");
            }
        }

        da.Update(dt2);
    }

    DataBind(dt2);
    ShowError(ErrorStr.ToString());
}

/// <summary>
/// Downloads the resource at <paramref name="url"/> over HTTP and returns its body as text.
/// </summary>
/// <param name="url">Absolute URL to request.</param>
/// <returns>
/// The response body, decoded with the character set announced by the response, or
/// with UTF-8 when the response names no character set or one that is not recognised.
/// </returns>
public static string GetHtml(string url)
{
    WebRequest request = WebRequest.Create(url);

    // Dispose response, stream and reader even when reading fails part-way.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream body = response.GetResponseStream())
    {
        Encoding encoding = Encoding.UTF8;
        string charset = response.CharacterSet;
        if (!string.IsNullOrEmpty(charset))
        {
            try
            {
                // Some servers send the charset wrapped in quotes.
                encoding = Encoding.GetEncoding(charset.Trim().Trim('"'));
            }
            catch (ArgumentException)
            {
                // Unknown charset name: keep the UTF-8 fallback instead of failing the download.
            }
        }

        using (StreamReader reader = new StreamReader(body, encoding))
        {
            return reader.ReadToEnd();
        }
    }
}


// Returns the distinct matches of the regular expression strRegex found in htmlCode,
// sorted with ArrayList's default ordering. Matching is case-insensitive and multiline.
public static ArrayList GetMatchesStr(string htmlCode, string strRegex)
{
    ArrayList unique = new ArrayList();
    Regex pattern = new Regex(strRegex, RegexOptions.IgnoreCase | RegexOptions.Multiline);

    foreach (Match hit in pattern.Matches(htmlCode))
    {
        string text = hit.ToString();

        // Keep only the first occurrence of each matched string.
        if (unique.Contains(text))
        {
            continue;
        }

        unique.Add(text);
    }

    unique.Sort();
    return unique;
}

/// <summary>
/// Downloads the resource at <paramref name="Url"/> over HTTP and writes it to the file
/// <paramref name="Path"/>, replacing any existing file.
/// </summary>
/// <param name="Url">Absolute URL of the resource to download.</param>
/// <param name="Path">Destination file path; its directory must already exist.</param>
public static void DownFile(string Url, string Path)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);

    // Dispose the response and both streams even when the copy fails part-way.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream source = response.GetResponseStream())
    // FileMode.Create truncates an existing file, so a shorter download can never
    // leave stale bytes of a previous, longer file at the end.
    using (FileStream target = new FileStream(Path, FileMode.Create, FileAccess.Write))
    {
        byte[] buffer = new byte[4096];
        int read;
        while ((read = source.Read(buffer, 0, buffer.Length)) > 0)
        {
            target.Write(buffer, 0, read);
        }
    }
}

AspNet技術asp.net(c#)做一個網頁數據采集工具,轉載需保留來源!

鄭重聲明:本文版權歸原作者所有,轉載文章僅為傳播更多信息之目的,如作者信息標記有誤,請第一時間聯系我們修改或刪除,多謝。

主站蜘蛛池模板: 亚洲欧美激情图片 | 国产日韩欧美综合色视频在线 | 中文字幕一区二区三区 精品 | 亚欧精品一区二区三区四区 | 麻豆视频大全 | 婷婷激情片 | 一区二区三区舞蹈区 | 久久精品国产福利 | 丁香六月婷婷 | 色视频网站在线 | 国产成品精品午夜视频 | 国产成人精品一区二区不卡 | 午夜爽爽性刺激一区二区视频 | 亚洲国产精品线在线观看 | 手机在线一区二区三区 | 一本久道久久综合中文字幕 | 国产成人盗拍精品免费视频 | 国产成人高清视频 | 久久中文精品 | 欧美激情片网站 | 91福利一区| 四虎永久免费最新在线 | 国产黄大片 | 亚洲男人的天堂久久香蕉 | 成人午夜精品 | 欧美日韩国产一区二区三区在线观看 | 色呦呦在线免费观看 | 国产亚洲精品国产福利在线观看 | 天天添天天操 | 小说区图片区综合久久88 | 91精品一区二区三区在线 | 丁香六月在线 | 九九99久久精品国产 | 亚洲视色| 久久中文网 | 亚洲福利视频网站 | 日韩精品高清自在线 | 国产免费观看视频 | 午夜国产小视频 | 欧美特黄三级在线观看 | 日本韩国三级在线 |