Office中国论坛/Access中国论坛
标题:
练习用正则表达式将网页table转换到DataGridView
[打印本页]
作者:
tianping
时间:
2014-4-17 00:26
标题:
练习用正则表达式将网页table转换到DataGridView
本帖最后由 tianping 于 2014-4-17 00:33 编辑
小辰同学要求将一个网页的表格数据抓取下来,
故做此练习。请看原网页及转换结果
[attach]53829[/attach][attach]53830[/attach]
作者:
tianping
时间:
2014-4-17 00:27
本帖最后由 tianping 于 2014-4-17 00:28 编辑
源代码
包含一小段抓取网页 HTML 的代码(未包含登录功能)
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
using System.Text.RegularExpressions;
using System.IO;
using System.Net;
namespace NetCapture
{
/// <summary>
/// Form that downloads a web page into <c>richTextBox1</c> and converts the first
/// HTML table found in that text into rows and columns of <c>dataGridView1</c>,
/// using regular expressions.
/// </summary>
public partial class Form2 : Form
{
    // The patterns are compiled once and shared by every click instead of being
    // rebuilt on each invocation. They run against text from which ALL whitespace
    // has already been removed, so '.' never has to cross a line break.

    /// <summary>Matches one whole table element, from the opening tag to the first closing tag.</summary>
    private static readonly Regex TableRegex = new Regex(@"<table[\s\S]*?</table>", RegexOptions.Multiline);

    /// <summary>Matches one table row, from the opening tag to the first closing tag.</summary>
    private static readonly Regex RowRegex = new Regex(@"<tr.*?</tr>");

    /// <summary>Matches the text between a header-cell opening tag and its closing tag.</summary>
    private static readonly Regex HeaderCellRegex = new Regex(@"(?<=<th.*?>)[^>]*?(?=</th>)");

    /// <summary>Matches the text between a data-cell opening tag and its closing tag.</summary>
    private static readonly Regex DataCellRegex = new Regex(@"(?<=<td.*?>)[^>]*?(?=</td>)");

    /// <summary>Matches everything stripped before parsing: whitespace, closing anchor tags and closing font tags.</summary>
    private static readonly Regex NoiseRegex = new Regex(@"(\s)|(</a>)|(</font>)");

    /// <summary>Creates the form and initializes its designer-generated controls.</summary>
    public Form2()
    {
        InitializeComponent();
    }

    /// <summary>
    /// Parses the HTML currently held in <c>richTextBox1</c> and fills
    /// <c>dataGridView1</c> from the first table it contains.
    /// </summary>
    /// <param name="sender">The control that raised the event (unused).</param>
    /// <param name="e">Event data (unused).</param>
    /// <remarks>
    /// The first row of the table supplies the column headers; every following row
    /// becomes one grid row. The grid is left empty when the text contains no table,
    /// the table has no rows, or the header row yields no usable columns.
    /// </remarks>
    private void button1_Click(object sender, EventArgs e)
    {
        this.dataGridView1.Columns.Clear();

        // Strip whitespace and the closing </a> / </font> tags so cell contents are plain text.
        string html = NoiseRegex.Replace(this.richTextBox1.Text, "");

        // Only the first table of the page is converted.
        Match table = TableRegex.Match(html);
        if (!table.Success)
        {
            return;
        }

        MatchCollection rows = RowRegex.Matches(table.Value);
        if (rows.Count == 0)
        {
            // A table without any row has neither headers nor data.
            return;
        }

        // The first row is treated as the header row.
        MatchCollection headerCells = HeaderCellRegex.Matches(rows[0].Value);
        int columnCount = headerCells.Count;

        // NOTE(review): both loops below start at index 1, so the first column of the
        // source table is never shown. This looks deliberate (same offset for headers
        // and data) - confirm against the target page.
        for (int i = 1; i < columnCount; i++)
        {
            DataGridViewTextBoxColumn column = new DataGridViewTextBoxColumn();
            column.Name = "col" + i.ToString();
            column.HeaderText = headerCells[i].Value;
            this.dataGridView1.Columns.Add(column);
        }

        if (this.dataGridView1.Columns.Count == 0)
        {
            // Rows cannot be added to a grid that has no columns.
            return;
        }

        // Rows after the header row are data rows.
        for (int r = 1; r < rows.Count; r++)
        {
            MatchCollection cells = DataCellRegex.Matches(rows[r].Value);
            int rowIndex = this.dataGridView1.Rows.Add();
            DataGridViewRow gridRow = this.dataGridView1.Rows[rowIndex];

            // A row may yield fewer cells than the header (e.g. cells whose content
            // the cell pattern cannot match); the remaining grid cells stay empty.
            int usableCells = Math.Min(columnCount, cells.Count);
            for (int j = 1; j < usableCells; j++)
            {
                gridRow.Cells[j - 1].Value = cells[j].Value;
            }
        }
    }

    /// <summary>
    /// Downloads the page whose address is in <c>textBox2</c> and places its text
    /// in <c>richTextBox1</c>.
    /// </summary>
    /// <param name="sender">The control that raised the event (unused).</param>
    /// <param name="e">Event data (unused).</param>
    /// <remarks>
    /// The response body is always decoded as gb2312. The request is synchronous and
    /// unauthenticated; exceptions from creating or sending the request are not
    /// caught here.
    /// </remarks>
    private void button2_Click(object sender, EventArgs e)
    {
        WebRequest request = WebRequest.Create(textBox2.Text);

        // 'using' guarantees the response, stream and reader are released even when
        // reading the body throws.
        using (WebResponse response = request.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, Encoding.GetEncoding("gb2312")))
        {
            this.richTextBox1.Text = reader.ReadToEnd();
        }
    }
}
}
复制代码
欢迎光临 Office中国论坛/Access中国论坛 (http://www.office-cn.net/)
Powered by Discuz! X3.3