private DataSet ConvertHTMLTablesToDataSet(string HTML) {
// Declarations
DataSet ds = new DataSet();
DataTable dt = null;
DataRow dr = null;
DataColumn dc = null;
string TableExpression = "<table[^>]*>(.*?)</table>";
string HeaderExpression = "<th[^>]*>(.*?)</th>";
string RowExpression = "<tr[^>]*>(.*?)</tr>";
string ColumnExpression = "<td[^>]*>(.*?)</td>";
bool HeadersExist = false;
int iCurrentColumn = 0;
int iCurrentRow = 0;
// Get a match for all the tables in the HTML
MatchCollection Tables = Regex.Matches(HTML, TableExpression, RegexOptions.Multiline | RegexOptions.Singleline | RegexOptions.IgnoreCase);
// Loop through each table element
foreach(Match Table in Tables) {
// Reset the current row counter and the header flag
iCurrentRow = 0;
HeadersExist = false;
// Add a new table to the DataSet
dt = new DataTable();
//Create the relevant amount of columns for this table (use the headers if they exist, otherwise use default names)
// if (Table.Value.Contains("<th"))
if (Table.Value.Contains("<th")) {
// Set the HeadersExist flag
HeadersExist = true;
// Get a match for all the rows in the table
MatchCollection Headers = Regex.Matches(Table.Value, HeaderExpression, RegexOptions.Multiline | RegexOptions.Singleline | RegexOptions.IgnoreCase);
// Loop through each header element
foreach(Match Header in Headers) {
dt.Columns.Add(Header.Groups[1].ToString());
}
} else {
for (int iColumns = 1; iColumns <= Regex.Matches(Regex.Matches(Regex.Matches(Table.Value, TableExpression, RegexOptions.Multiline | RegexOptions.Singleline | RegexOptions.IgnoreCase)[0].ToString(), RowExpression, RegexOptions.Multiline | RegexOptions.Singleline | RegexOptions.IgnoreCase)[0].ToString(), ColumnExpression, RegexOptions.Multiline | RegexOptions.Singleline | RegexOptions.IgnoreCase).Count; iColumns++) {
dt.Columns.Add("Column " + iColumns);
}
}
//Get a match for all the rows in the table
MatchCollection Rows = Regex.Matches(Table.Value, RowExpression, RegexOptions.Multiline | RegexOptions.Singleline | RegexOptions.IgnoreCase);
// Loop through each row element
foreach(Match Row in Rows) {
// Only loop through the row if it isn't a header row
if (!(iCurrentRow == 0 && HeadersExist)) {
// Create a new row and reset the current column counter
dr = dt.NewRow();
iCurrentColumn = 0;
// Get a match for all the columns in the row
MatchCollection Columns = Regex.Matches(Row.Value, ColumnExpression, RegexOptions.Multiline | RegexOptions.Singleline | RegexOptions.IgnoreCase);
// Loop through each column element
foreach(Match Column in Columns) {
// Add the value to the DataRow
dr[iCurrentColumn] = Column.Groups[1].ToString();
// Increase the current column
iCurrentColumn++;
}
// Add the DataRow to the DataTable
dt.Rows.Add(dr);
}
// Increase the current row counter
iCurrentRow++;
}
// Add the DataTable to the DataSet
ds.Tables.Add(dt);
}
return ds;
}
// Declarations
DataSet ds = new DataSet();
DataTable dt = null;
DataRow dr = null;
DataColumn dc = null;
string TableExpression = "<table[^>]*>(.*?)</table>";
string HeaderExpression = "<th[^>]*>(.*?)</th>";
string RowExpression = "<tr[^>]*>(.*?)</tr>";
string ColumnExpression = "<td[^>]*>(.*?)</td>";
bool HeadersExist = false;
int iCurrentColumn = 0;
int iCurrentRow = 0;
// Get a match for all the tables in the HTML
MatchCollection Tables = Regex.Matches(HTML, TableExpression, RegexOptions.Multiline | RegexOptions.Singleline | RegexOptions.IgnoreCase);
// Loop through each table element
foreach(Match Table in Tables) {
// Reset the current row counter and the header flag
iCurrentRow = 0;
HeadersExist = false;
// Add a new table to the DataSet
dt = new DataTable();
//Create the relevant amount of columns for this table (use the headers if they exist, otherwise use default names)
// if (Table.Value.Contains("<th"))
if (Table.Value.Contains("<th")) {
// Set the HeadersExist flag
HeadersExist = true;
// Get a match for all the rows in the table
MatchCollection Headers = Regex.Matches(Table.Value, HeaderExpression, RegexOptions.Multiline | RegexOptions.Singleline | RegexOptions.IgnoreCase);
// Loop through each header element
foreach(Match Header in Headers) {
dt.Columns.Add(Header.Groups[1].ToString());
}
} else {
for (int iColumns = 1; iColumns <= Regex.Matches(Regex.Matches(Regex.Matches(Table.Value, TableExpression, RegexOptions.Multiline | RegexOptions.Singleline | RegexOptions.IgnoreCase)[0].ToString(), RowExpression, RegexOptions.Multiline | RegexOptions.Singleline | RegexOptions.IgnoreCase)[0].ToString(), ColumnExpression, RegexOptions.Multiline | RegexOptions.Singleline | RegexOptions.IgnoreCase).Count; iColumns++) {
dt.Columns.Add("Column " + iColumns);
}
}
//Get a match for all the rows in the table
MatchCollection Rows = Regex.Matches(Table.Value, RowExpression, RegexOptions.Multiline | RegexOptions.Singleline | RegexOptions.IgnoreCase);
// Loop through each row element
foreach(Match Row in Rows) {
// Only loop through the row if it isn't a header row
if (!(iCurrentRow == 0 && HeadersExist)) {
// Create a new row and reset the current column counter
dr = dt.NewRow();
iCurrentColumn = 0;
// Get a match for all the columns in the row
MatchCollection Columns = Regex.Matches(Row.Value, ColumnExpression, RegexOptions.Multiline | RegexOptions.Singleline | RegexOptions.IgnoreCase);
// Loop through each column element
foreach(Match Column in Columns) {
// Add the value to the DataRow
dr[iCurrentColumn] = Column.Groups[1].ToString();
// Increase the current column
iCurrentColumn++;
}
// Add the DataRow to the DataTable
dt.Rows.Add(dr);
}
// Increase the current row counter
iCurrentRow++;
}
// Add the DataTable to the DataSet
ds.Tables.Add(dt);
}
return ds;
}
No comments:
Post a Comment